// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/container/flat_set.hpp>
#include "include/memory.h"
#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>
using namespace std;

// #include "include/unordered_map.h"
// #include "include/unordered_set.h"

//#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs

class OSD;
class OSDService;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;

namespace Scrub {
  class Store;
}

void intrusive_ptr_add_ref(PG *pg);
void intrusive_ptr_release(PG *pg);

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
        this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
};
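
// A hedged usage sketch (illustrative only, not part of the header):
// PGStateHistory is driven by the NamedState RAII helper defined further
// down in PG.  Each state-machine state enters on construction and exits
// on destruction, and the history keeps the 10 most recent state
// instances in its circular buffer:
//
//   {
//     NamedState s(pg, "GetInfo"); // pgstate_history.enter(pg, now, "GetInfo")
//     ...                          // time spent in the state accumulates
//   }                              // ~NamedState -> pgstate_history.exit("GetInfo")
//
// dump() then renders the recorded (enter, exit, state-name) tuples.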

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;     // enter/exit counts
    uint64_t events;
    utime_t event_time;       // time spent processing events
    utime_t total_time;       // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

 public:
  PGRecoveryStats() : lock("PGRecoverStats::lock") {}

  void reset() {
    Mutex::Locker l(lock);
    info.clear();
  }
  void dump(ostream& out) {
    Mutex::Locker l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    Mutex::Locker l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    Mutex::Locker l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    Mutex::Locker l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};
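
// A hedged sketch of the accounting above (illustrative numbers only):
// the recovery state machine brackets every state with
// log_enter()/log_exit(), keyed by the slash-separated nested state
// name, so a dump() row such as
//
//   3   3   12   0.000042   1.500000   0.200000   0.900000   Started/Primary/Active
//
// reads as: entered 3 times, exited 3 times, 12 events processed, with
// event_time, total_time, min_time and max_time in the middle columns.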

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;
  uint64_t auid;

  pg_pool_t info;
  SnapContext snapc;   // the default pool snapc, ready to go.

  interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;  // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(map->get_pool_name(id)),
      auid(map->get_pg_pool(id)->auid) {
    const pg_pool_t *pi = map->get_pg_pool(id);
    assert(pi);
    info = *pi;
    snapc = pi->get_snap_context();
    pi->build_removed_snaps(cached_removed_snaps);
  }

  void update(OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
protected:
  OSDService *osd;
  CephContext *cct;
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
public:
  std::string gen_prefix() const override;
  CephContext *get_cct() const override { return cct; }
  unsigned get_subsys() const override { return ceph_subsys_osd; }

  /*** PG ****/
  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  OSDMapRef osdmap_ref;
  OSDMapRef last_persisted_osdmap_ref;
  PGPool pool;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

public:
  OSDMapRef get_osdmap() const {
    assert(is_locked());
    assert(osdmap_ref);
    return osdmap_ref;
  }
protected:

  /** locking and reference counting.
   * I destroy myself when the reference count hits zero.
   * lock() should be called before doing anything.
   * get() should be called on pointer copy (to another thread, etc.).
   * put() should be called on destruction of some previously copied pointer.
   * unlock() when done with the current pointer (_most common_).
   */
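  // A hedged sketch of the intended pattern (illustrative only):
  //
  //   pg->lock();            // take _lock before touching PG state
  //   pg->get("some-tag");   // bump the refcount before handing the
  //                          // pointer to another thread/queue
  //   queue_elsewhere(pg);   // hypothetical hand-off
  //   pg->unlock();          // done with this critical section
  //   ...
  //   pg->put("some-tag");   // the other side drops its ref when done;
  //                          // the PG deletes itself at refcount zero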
  mutable Mutex _lock;
  std::atomic_uint ref{0};

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock;
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id;
#endif

public:
  bool deleting;  // true while in removing or OSD is shutting down

  ZTracer::Endpoint trace_endpoint;

  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    assert(!dirty_info);
    assert(!dirty_big_info);
    _lock.Unlock();
  }

  bool is_locked() const {
    return _lock.is_locked();
  }

#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);

  bool dirty_info, dirty_big_info;

public:
  bool is_ec_pg() const {
    return pool.info.ec_pool();
  }
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v;
  static const __u8 cur_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //     (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 7;
  bool must_upgrade() {
    return info_struct_v < cur_struct_v;
  }
  bool can_upgrade() {
    return info_struct_v >= compat_struct_v;
  }
  void upgrade(ObjectStore *store);

  const coll_t coll;
  ObjectStore::CollectionHandle ch;
  PGLog pg_log;
  static string get_info_key(spg_t pgid) {
    return stringify(pgid) + "_info";
  }
  static string get_biginfo_key(spg_t pgid) {
    return stringify(pgid) + "_biginfo";
  }
  static string get_epoch_key(spg_t pgid) {
    return stringify(pgid) + "_epoch";
  }
  ghobject_t pgmeta_oid;

  // ------------------
  // MissingLoc

  class MissingLoc {
  public:
    // a loc_count indicates how many locations we know in each of
    // these distinct sets
    struct loc_count_t {
      int up = 0;     //< up
      int other = 0;  //< other

      friend bool operator<(const loc_count_t& l,
                            const loc_count_t& r) {
        return (l.up < r.up ||
                (l.up == r.up &&
                 (l.other < r.other)));
      }
      friend ostream& operator<<(ostream& out, const loc_count_t& l) {
        assert(l.up >= 0);
        assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
      }
    };


  private:

    loc_count_t _get_count(const set<pg_shard_t>& shards) {
      loc_count_t r;
      for (auto s : shards) {
        if (pg->upset.count(s)) {
          r.up++;
        } else {
          r.other++;
        }
      }
      return r;
    }

    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;

    // for every entry in missing_loc, we count how many of each type of shard we have,
    // and maintain totals here.  The sum of the values for this map will always equal
    // missing_loc.size().
    map < shard_id_t, map<loc_count_t,int> > missing_by_count;
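
    // A hedged example of the invariant above (illustrative only): on a
    // replicated pool with three missing objects whose known locations
    // count as (1 up + 0 other), (1 up + 0 other) and (0 up + 2 other),
    // missing_by_count[NO_SHARD] holds { (1+0) -> 2, (0+2) -> 1 }, and
    // the values sum to missing_loc.size() == 3.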

    void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) {
      if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
        int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
        // For completely missing shards initialize with empty set<pg_shard_t>
        for (int i = 0 ; i < num_shards ; ++i) {
          shard_id_t shard(i);
          pgsbs[shard];
        }
        for (auto pgs: s)
          pgsbs[pgs.shard].insert(pgs);
      } else {
        pgsbs[shard_id_t::NO_SHARD] = s;
      }
    }

    void _inc_count(const set<pg_shard_t>& s) {
      map< shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs)
        ++missing_by_count[shard.first][_get_count(shard.second)];
    }
    void _dec_count(const set<pg_shard_t>& s) {
      map< shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs) {
        auto p = missing_by_count[shard.first].find(_get_count(shard.second));
        assert(p != missing_by_count[shard.first].end());
        if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
        }
      }
    }

    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) { }
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    string gen_prefix() const { return pg->gen_prefix(); }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      return i->second.is_delete();
    }
    bool is_unfound(const hobject_t &hoid) const {
      return needs_recovery(hoid) && !is_deleted(hoid) && (
        !missing_loc.count(hoid) ||
        !(*is_recoverable)(missing_loc.find(hoid)->second));
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
      missing_by_count.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p == missing_loc.end()) {
        p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
      } else {
        _dec_count(p->second);
      }
      p->second.insert(location);
      _inc_count(p->second);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
        _inc_count(p->second);
      }
    }
    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
          assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
      needs_recovery_map[hoid] = pg_missing_item(need, have);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      assert(needs_recovery(hoid));
      needs_recovery_map[hoid].need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      ); ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources, ///< [in] a set of sources which can be used for all objects
      ThreadPool::TPHandle* handle    ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now-down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
        return;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      assert(info.last_backfill.is_max());
      assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        auto pinfoiter = pinfo.find(i.first);
        assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
      _inc_count(mliter->second);
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      return missing_loc.count(hoid) ?
        missing_loc.find(hoid)->second : empty_set;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
    const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
      return missing_by_count;
    }
  } missing_loc;

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  set<hobject_t> recovering_oids;
#endif

protected:
  int role;        // 0 = primary, 1 = replica, -1=none.
  unsigned state;  // PG_STATE_*

  bool send_notify; ///< true if we are non-primary and should notify the primary

public:
  eversion_t last_update_ondisk;   // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk; // last_complete that has committed.
  eversion_t last_update_applied;


  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
    PGRef pg;
    epoch_t e;
    eversion_t v;
    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
      : pg(pg), e(e), v(v) {}
    void finish(int) override {
      pg->lock();
      if (!pg->pg_has_reset_since(e)) {
        pg->last_rollback_info_trimmed_to_applied = v;
      }
      pg->unlock();
    }
  };
  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
  // and the transaction has applied
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
 public:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  set<pg_shard_t> actingbackfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by; ///< osds we are blocked by (for pg stats)

  // [primary only] content recovery state

public:
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

public:
  bool dne() { return info.dne(); }
  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    set<PGRef> created_pgs;
    C_Contexts *on_applied;
    C_Contexts *on_safe;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                C_Contexts *on_applied,
                C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        on_applied(on_applied),
        on_safe(on_safe),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        on_applied(rctx.on_applied),
        on_safe(rctx.on_safe),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      assert(query_map);
      assert(info_map);
      assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }

    void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
      assert(notify_list);
      (*notify_list)[to.osd].push_back(make_pair(info, pi));
    }
  };


  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };



protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;  // non-acting osds that have PG data.
  eversion_t oldest_update;   // acting: lowest (valid) last_update in active set
  map<pg_shard_t, pg_info_t> peer_info;  // info from peers (stray or prior)
  set<pg_shard_t> peer_purged;           // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;    // logs I've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // I deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
                                       // which are unfound on the primary
  epoch_t last_peering_reset;


  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();
public:
  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version; /// version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
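
  // A hedged example of the interval bookkeeping above (illustrative
  // only): if objects holds { a, b, c } with begin == a, then trim_to(b)
  // pops a and b, after which trim() moves begin to c; once the map
  // drains completely, begin == end and empty() is true, signalling that
  // the next scan should repopulate the interval starting from end.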

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  friend class OSD;

public:
  set<pg_shard_t> backfill_targets;

  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued.  Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order.  Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_flush
   *    - is_active() and flushes_in_progress
   *    - waiting for final flush during activate
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   *  1. During an interval change, we requeue *everything* in the above order.
   *
   *  2. When an obc rwlock is released, we check for a scrub block and requeue
   *     the op there if it applies.  We ignore the unreadable/degraded/blocked
   *     queues because we assume they cannot apply at that time (this is
   *     probably mostly true).
   *
   *  3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *     it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounter an unexpected error.  FIXME.
   */
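  // A hedged sketch of the ordering rule above (illustrative only):
  // suppose op A is parked on waiting_for_peered and a later op B is
  // parked on waiting_for_active.  On an interval change we requeue
  // waiting_for_peered before waiting_for_active, and each list is
  // pushed back to the op_wq most-recent-first, so A is ultimately
  // dequeued before B and the original arrival order is preserved.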

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than ours (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef> waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef> waiting_for_active;
  list<OpRequestRef> waiting_for_flush;
  list<OpRequestRef> waiting_for_scrub;

  list<OpRequestRef> waiting_for_cache_not_full;
  list<OpRequestRef> waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  // for ordering writes
  ceph::shared_ptr<ObjectStore::Sequencer> osr;

  void _update_calc_stats();
  void _update_blocked_by();
  void publish_stats_to_osd();
  void clear_publish_stats();

public:
  void clear_primary_state();

  bool is_actingbackfill(pg_shard_t osd) const {
    return actingbackfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }
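  // A hedged example (illustrative only): in an EC pool the shard id is
  // positional, so with acting = [3, 7, 1],
  // has_shard(true, acting, pg_shard_t(7, shard_id_t(1))) is true because
  // v[1] == 7; in a replicated pool the shard is NO_SHARD and membership
  // is just a linear search for the osd id.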

  bool needs_recovery() const;
  bool needs_backfill() const;

  /// clip calculated priority to reasonable range
  inline int clamp_recovery_priority(int priority);
  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();

  void mark_clean();  ///< mark an active pg clean
  void _change_recovery_force_mode(int new_mode, bool clear);

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = MAX(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
        info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = MAX(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
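  // A hedged worked example (illustrative numbers only): with
  // last_epoch_clean = 120, epoch_pool_created = 10,
  // same_interval_since = 150 and oldest_map = 100, start is
  // MAX(120, 100) = 120 and end is MAX(150, 10) = 150, so past
  // intervals must cover epochs [120, 150).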
  void check_past_interval_bounds() const;
  PastIntervals::PriorSet build_prior();

  void remove_down_peer_info(const OSDMapRef osdmap);

  bool adjust_need_up_thru(const OSDMapRef osdmap);

  bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
  virtual void dump_recovery_info(Formatter *f) const = 0;

  bool calc_min_last_complete_ondisk() {
    eversion_t min = last_complete_ondisk;
    assert(!actingbackfill.empty());
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
         i != actingbackfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      if (peer_last_complete_ondisk.count(*i) == 0)
        return false;   // we don't have complete info
      eversion_t a = peer_last_complete_ondisk[*i];
      if (a < min)
        min = a;
    }
    if (min == min_last_complete_ondisk)
      return false;
    min_last_complete_ondisk = min;
    return true;
  }

  virtual void calc_trim_to() = 0;

  void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
  void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
                       pg_missing_t& omissing, pg_shard_t from);
  bool proc_replica_info(
    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);

  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };

  void update_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid,
    const set<snapid_t> &snaps);
  void clear_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid);
  void remove_snap_mapped_object(
    ObjectStore::Transaction& t, const hobject_t& soid);
  void merge_log(
    ObjectStore::Transaction& t, pg_info_t &oinfo,
    pg_log_t &olog, pg_shard_t from);
  void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
  bool search_for_missing(
    const pg_info_t &oinfo, const pg_missing_t &omissing,
    pg_shard_t fromosd,
    RecoveryCtx*);

  void check_for_lost_objects();
  void forget_lost_objects();

  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);

  void trim_write_ahead();

  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const;
  static void calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  static void calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
  void build_might_have_unfound();
  void activate(
    ObjectStore::Transaction& t,
    epoch_t activation_epoch,
    list<Context*>& tfin,
    map<int, map<spg_t,pg_query_t> >& query_map,
    map<int,
        vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
    RecoveryCtx *ctx);
  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
  void all_activated_and_committed();

  void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);

  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }

  virtual void check_local() = 0;

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  void purge_strays();

  void update_heartbeat_peers();

  Context *finish_sync_event;

  void finish_recovery(list<Context*>& tfin);
  void _finish_recovery(Context *c);
  void cancel_recovery();
  void clear_recovery_state();
  virtual void _clear_recovery_state() = 0;
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;

  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();

  void add_pg_backoff(SessionRef s) {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    add_backoff(s, begin, end);
  }
  void release_pg_backoffs() {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    release_backoffs(begin, end);
  }

  void rm_backoff(BackoffRef b);

  // -- scrub --
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;
    bool reserved, reserve_failed;
    epoch_t epoch_start;

    // common to both scrubs
    bool active;
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int large_omap_objects = 0;
    int fixed;
    ScrubMap primary_scrubmap;
    ScrubMapBuilder primary_scrubmap_pos;
    epoch_t replica_scrub_start = 0;
    ScrubMap replica_scrubmap;
    ScrubMapBuilder replica_scrubmap_pos;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;
    utime_t scrub_reg_stamp;  // stamp we registered for

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority;

    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

    void clean_meta_map(ScrubMap &for_meta_scrub) {
      if (end.is_max() ||
          cleaned_meta_map.objects.empty()) {
        cleaned_meta_map.swap(for_meta_scrub);
      } else {
        auto iter = cleaned_meta_map.objects.end();
        --iter; // not empty, see if clause
        auto begin = cleaned_meta_map.objects.begin();
        if (iter->first.has_snapset()) {
          ++iter;
        } else {
          while (iter != begin) {
            auto next = iter--;
            if (next->first.get_head() != iter->first.get_head()) {
              ++iter;
              break;
            }
          }
        }
        for_meta_scrub.objects.insert(begin, iter);
        cleaned_meta_map.objects.erase(begin, iter);
      }
    }
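
    // A hedged sketch of what clean_meta_map() does (illustrative only):
    // clones sort before their head, so if cleaned_meta_map holds
    // { foo:clone1, foo:clone2, foo:head, bar:clone1 }, the trailing
    // "bar" entries lack a snapset-bearing head and may still gain one
    // in a later chunk; everything up to and including foo:head is
    // handed to for_meta_scrub, while the incomplete tail stays behind
    // in cleaned_meta_map until the scrub range advances.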

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;  // [start,end)
    hobject_t max_end;     // Largest end that may have been sent to replicas
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      BUILD_MAP_DONE,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
      BUILD_MAP_REPLICA,
    } state;

    std::unique_ptr<Scrub::Store> store;
    // deep scrub
    bool deep;
    int preempt_left;
    int preempt_divisor;

    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch( state )
      {
      case INACTIVE: ret = "INACTIVE"; break;
      case NEW_CHUNK: ret = "NEW_CHUNK"; break;
      case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
      case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
      case BUILD_MAP: ret = "BUILD_MAP"; break;
      case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
      case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
      case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
      case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
      case FINISH: ret = "FINISH"; break;
      case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // clear all state
    void reset() {
      active = false;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      max_end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      large_omap_objects = 0;
      fixed = 0;
      deep = false;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      primary_scrubmap = ScrubMap();
      primary_scrubmap_pos.reset();
      replica_scrubmap = ScrubMap();
      replica_scrubmap_pos.reset();
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;

  bool scrub_after_recovery;

  int active_pushes;

  bool scrub_can_preempt = false;
  bool scrub_preempted = false;

  // We allow some number of preemptions of the scrub, which means we do
  // not block.  Then we start to block.  Once we start blocking, we do
  // not stop until the scrub range is completed.
  bool write_blocked_by_scrub(const hobject_t &soid);

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);

  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state();
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(
    const vector<ghobject_t> &rollback_obs,
    ThreadPool::TPHandle &handle);
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
  int build_scrub_map_chunk(
    ScrubMap &map,
    ScrubMapBuilder &pos,
    hobject_t start, hobject_t end, bool deep,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;
  bool sched_scrub();
  void reg_next_scrub();
  void unreg_next_scrub();

  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);
  void sub_op_scrub_map(OpRequestRef op);

  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  void schedule_backfill_retry(float retry);
  void schedule_recovery_retry(float retry);

  // -- recovery state --

  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PG::CephPeeringEvtRef(
                                new PG::CephPeeringEvt(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };
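
  // A hedged usage sketch (illustrative only): QueuePeeringEvt adapts a
  // statechart event into a Context so it can ride a transaction or
  // timer completion, e.g.
  //
  //   t.register_on_applied(
  //     new QueuePeeringEvt<Backfilled>(this, get_osdmap()->get_epoch(),
  //                                     Backfilled()));
  //
  // When the Context fires, the event is queued through the normal
  // peering-event path with the epoch it was created at.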

  class CephPeeringEvt {
    epoch_t epoch_sent;
    epoch_t epoch_requested;
    boost::intrusive_ptr< const boost::statechart::event_base > evt;
    string desc;
  public:
    MEMPOOL_CLASS_HELPERS();
    template <class T>
    CephPeeringEvt(epoch_t epoch_sent,
                   epoch_t epoch_requested,
                   const T &evt_) :
      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
      evt(evt_.intrusive_from_this()) {
      stringstream out;
      out << "epoch_sent: " << epoch_sent
          << " epoch_requested: " << epoch_requested << " ";
      evt_.print(&out);
      desc = out.str();
    }
    epoch_t get_epoch_sent() { return epoch_sent; }
    epoch_t get_epoch_requested() { return epoch_requested; }
    const boost::statechart::event_base &get_event() { return *evt; }
    string get_desc() { return desc; }
  };
  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
  list<CephPeeringEvtRef> peering_queue;  // op queue
  list<CephPeeringEvtRef> peering_waiters;

  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };

  struct MInfoRec : boost::statechart::event< MInfoRec > {
    pg_shard_t from;
    pg_info_t info;
    epoch_t msg_epoch;
    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
      from(from), info(info), msg_epoch(msg_epoch) {}
    void print(std::ostream *out) const {
      *out << "MInfoRec from " << from << " info: " << info;
    }
  };

  struct MLogRec : boost::statechart::event< MLogRec > {
    pg_shard_t from;
    boost::intrusive_ptr<MOSDPGLog> msg;
    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
      from(from), msg(msg) {}
    void print(std::ostream *out) const {
      *out << "MLogRec from " << from;
    }
  };

  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
    pg_shard_t from;
    pg_notify_t notify;
    uint64_t features;
    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
      from(from), notify(notify), features(f) {}
    void print(std::ostream *out) const {
      *out << "MNotifyRec from " << from << " notify: " << notify
           << " features: 0x" << hex << features << dec;
    }
  };

  struct MQuery : boost::statechart::event< MQuery > {
    pg_shard_t from;
    pg_query_t query;
    epoch_t query_epoch;
    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
      from(from), query(query), query_epoch(query_epoch) {}
    void print(std::ostream *out) const {
      *out << "MQuery from " << from
           << " query_epoch " << query_epoch
           << " query: " << query;
    }
  };

  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;
    OSDMapRef lastmap;
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };

  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
    unsigned priority;
    explicit RequestBackfillPrio(unsigned prio) :
      boost::statechart::event< RequestBackfillPrio >(),
      priority(prio) {}
    void print(std::ostream *out) const {
      *out << "RequestBackfillPrio: priority " << priority;
    }
  };
#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
    T() : boost::statechart::event< T >() {}                       \
    void print(std::ostream *out) const {                          \
      *out << #T;                                                  \
    }                                                              \
  };
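  // A hedged note (illustrative only): TrivialEvent(Backfilled), used
  // below, expands to
  //
  //   struct Backfilled : boost::statechart::event< Backfilled > {
  //     Backfilled() : boost::statechart::event< Backfilled >() {}
  //     void print(std::ostream *out) const { *out << "Backfilled"; }
  //   };
  //
  // i.e. a payload-free event whose print() emits just its type name.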
  struct DeferBackfill : boost::statechart::event<DeferBackfill> {
    float delay;
    explicit DeferBackfill(float delay) : delay(delay) {}
    void print(std::ostream *out) const {
      *out << "DeferBackfill: delay " << delay;
    }
  };
  struct DeferRecovery : boost::statechart::event<DeferRecovery> {
    float delay;
    explicit DeferRecovery(float delay) : delay(delay) {}
    void print(std::ostream *out) const {
      *out << "DeferRecovery: delay " << delay;
    }
  };
  struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
    explicit UnfoundBackfill() {}
    void print(std::ostream *out) const {
      *out << "UnfoundBackfill";
    }
  };
  struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
    explicit UnfoundRecovery() {}
    void print(std::ostream *out) const {
      *out << "UnfoundRecovery";
    }
  };
protected:
  TrivialEvent(Initialize)
  TrivialEvent(Load)
  TrivialEvent(GotInfo)
  TrivialEvent(NeedUpThru)
  TrivialEvent(NullEvt)
  TrivialEvent(FlushedEvt)
  TrivialEvent(Backfilled)
  TrivialEvent(LocalBackfillReserved)
  TrivialEvent(RemoteBackfillReserved)
  TrivialEvent(RejectRemoteReservation)
  TrivialEvent(RemoteReservationRejected)
  TrivialEvent(RemoteReservationCanceled)
  TrivialEvent(RequestBackfill)
  TrivialEvent(RequestRecovery)
  TrivialEvent(RecoveryDone)
  TrivialEvent(BackfillTooFull)
  TrivialEvent(RecoveryTooFull)

  TrivialEvent(MakePrimary)
  TrivialEvent(MakeStray)
  TrivialEvent(NeedActingChange)
  TrivialEvent(IsIncomplete)
  TrivialEvent(IsDown)

  TrivialEvent(AllReplicasRecovered)
  TrivialEvent(DoRecovery)
  TrivialEvent(LocalRecoveryReserved)
  TrivialEvent(RemoteRecoveryReserved)
  TrivialEvent(AllRemotesReserved)
  TrivialEvent(AllBackfillsReserved)
  TrivialEvent(GoClean)

  TrivialEvent(AllReplicasActivated)

  TrivialEvent(IntervalFlush)

  /* Encapsulates PG recovery process */
  class RecoveryState {
    void start_handle(RecoveryCtx *new_ctx);
    void end_handle();
  public:
    void begin_block_outgoing();
    void end_block_outgoing();
    void clear_blocked_outgoing();
  private:

    /* States */
    struct Initial;
    class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
      RecoveryState *state;
    public:
      PG *pg;

      utime_t event_time;
      uint64_t event_count;

      void clear_event_counters() {
        event_time = utime_t();
        event_count = 0;
      }

      void log_enter(const char *state_name);
      void log_exit(const char *state_name, utime_t duration);

      RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}

      /* Accessor functions for state methods */
      ObjectStore::Transaction* get_cur_transaction() {
        assert(state->rctx);
        assert(state->rctx->transaction);
        return state->rctx->transaction;
      }

      void send_query(pg_shard_t to, const pg_query_t &query) {
        assert(state->rctx);
        assert(state->rctx->query_map);
        (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
          query;
      }

      map<int, map<spg_t, pg_query_t> > *get_query_map() {
        assert(state->rctx);
        assert(state->rctx->query_map);
        return state->rctx->query_map;
      }

      map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
        assert(state->rctx);
        assert(state->rctx->info_map);
        return state->rctx->info_map;
      }

      list< Context* > *get_on_safe_context_list() {
        assert(state->rctx);
        assert(state->rctx->on_safe);
        return &(state->rctx->on_safe->contexts);
      }

      list< Context * > *get_on_applied_context_list() {
        assert(state->rctx);
        assert(state->rctx->on_applied);
        return &(state->rctx->on_applied->contexts);
      }

      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }

      void send_notify(pg_shard_t to,
                       const pg_notify_t &info, const PastIntervals &pi) {
        assert(state->rctx);
        state->rctx->send_notify(to, info, pi);
      }
    };
    friend class RecoveryMachine;

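    // Illustrative sketch (assumed usage, not part of the original header):
    // state react() handlers reach the shared RecoveryCtx through the
    // accessors above via boost::statechart's context<>(), e.g.
    //
    //   boost::statechart::result SomeState::react(const SomeEvt &evt) {
    //     PG *pg = context< RecoveryMachine >().pg;
    //     ObjectStore::Transaction *t =
    //       context< RecoveryMachine >().get_cur_transaction();
    //     // ... queue writes on t, send queries/notifies ...
    //     return transit< NextState >();
    //   }
    //
    // SomeState, SomeEvt and NextState are placeholders, not real states.
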
    /* States */

    // Initial
    // Reset
    // Start
    //   Started
    //     Primary
    //       WaitActingChange
    //       Peering
    //         GetInfo
    //         GetLog
    //         GetMissing
    //         WaitUpThru
    //         Incomplete
    //       Active
    //         Activating
    //         Clean
    //         Recovered
    //         Backfilling
    //         WaitRemoteBackfillReserved
    //         WaitLocalBackfillReserved
    //         NotBackfilling
    //         NotRecovering
    //         Recovering
    //         WaitRemoteRecoveryReserved
    //         WaitLocalRecoveryReserved
    //     ReplicaActive
    //       RepNotRecovering
    //       RepRecovering
    //       RepWaitBackfillReserved
    //       RepWaitRecoveryReserved
    //     Stray
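    // (editorial note: indentation above mirrors the boost::statechart
    //  nesting of the state structs defined below; each inner state names
    //  its parent as its second template argument)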
    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };

    struct Reset;

    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< Initialize, Reset >,
        boost::statechart::custom_reaction< Load >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;

      boost::statechart::result react(const Load&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };
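    // Note (editorial): in each reactions list, transition< E, S > switches
    // directly to state S on event E, while custom_reaction< E > dispatches
    // to the best-matching react() overload; events that fall through to
    // react(const boost::statechart::event_base&) are discarded, and any
    // event with no reaction at all hits the catch-all transition to Crashed.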

    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Start;

    struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
      explicit Started(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Primary;
    struct Stray;

    struct Start : boost::statechart::state< Start, Started >, NamedState {
      explicit Start(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< MakePrimary, Primary >,
        boost::statechart::transition< MakeStray, Stray >
        > reactions;
    };

    struct Peering;
    struct WaitActingChange;
    struct Incomplete;
    struct Down;

    struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
      explicit Primary(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< NeedActingChange, WaitActingChange >
        > reactions;
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MNotifyRec&);
    };

    struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary >,
                              NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >
        > reactions;
      explicit WaitActingChange(my_context ctx);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MNotifyRec&);
      void exit();
    };

    struct GetInfo;
    struct Active;

    struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
      PastIntervals::PriorSet prior_set;
      bool history_les_bound;  ///< need osd_find_best_info_ignore_history_les

      explicit Peering(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::transition< Activate, Active >,
        boost::statechart::custom_reaction< AdvMap >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap &advmap);
    };

    struct WaitLocalRecoveryReserved;
    struct Activating;
    struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
      explicit Active(my_context ctx);
      void exit();

      const set<pg_shard_t> remote_shards_to_reserve_recovery;
      const set<pg_shard_t> remote_shards_to_reserve_backfill;
      bool all_replicas_activated;

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< AllReplicasActivated >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< DoRecovery >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MNotifyRec& notevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const Backfilled&) {
        return discard_event();
      }
      boost::statechart::result react(const AllReplicasActivated&);
      boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DoRecovery&) {
        return discard_event();
      }
    };

    struct Clean : boost::statechart::state< Clean, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
        > reactions;
      explicit Clean(my_context ctx);
      void exit();
    };

    struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< GoClean, Clean >,
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< AllReplicasActivated >
        > reactions;
      explicit Recovered(my_context ctx);
      void exit();
      boost::statechart::result react(const AllReplicasActivated&) {
        post_event(GoClean());
        return forward_event();
      }
    };

    struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< Backfilled, Recovered >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< RemoteReservationRejected >
        > reactions;
      explicit Backfilling(my_context ctx);
      boost::statechart::result react(const RemoteReservationRejected& evt);
      boost::statechart::result react(const DeferBackfill& evt);
      boost::statechart::result react(const UnfoundBackfill& evt);
      void exit();
    };

    struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::transition< AllBackfillsReserved, Backfilling >
        > reactions;
      set<pg_shard_t>::const_iterator backfill_osd_it;
      explicit WaitRemoteBackfillReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejected& evt);
    };

    struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
        > reactions;
      explicit WaitLocalBackfillReserved(my_context ctx);
      void exit();
    };

    struct NotBackfilling : boost::statechart::state< NotBackfilling, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >,
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >
        > reactions;
      explicit NotBackfilling(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejected& evt);
    };

    struct NotRecovering : boost::statechart::state< NotRecovering, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< UnfoundRecovery >
        > reactions;
      explicit NotRecovering(my_context ctx);
      boost::statechart::result react(const DeferRecovery& evt) {
        /* no-op */
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        /* no-op */
        return discard_event();
      }
      void exit();
    };

    struct RepNotRecovering;
    struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
      explicit ReplicaActive(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< Activate >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MQuery&);
      boost::statechart::result react(const Activate&);
      boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
      }
    };

    struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< RecoveryDone, RepNotRecovering >,
        // for compat with old peers
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::custom_reaction< BackfillTooFull >
        > reactions;
      explicit RepRecovering(my_context ctx);
      boost::statechart::result react(const BackfillTooFull &evt);
      void exit();
    };

    struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationCanceled >
        > reactions;
      explicit RepWaitBackfillReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved &evt);
      boost::statechart::result react(const RejectRemoteReservation &evt);
      boost::statechart::result react(const RemoteReservationRejected &evt);
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };

    struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        // for compat with old peers
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationCanceled >
        > reactions;
      explicit RepWaitRecoveryReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      boost::statechart::result react(const RemoteReservationRejected &evt) {
        // for compat with old peers
        post_event(RemoteReservationCanceled());
        return discard_event();
      }
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };

    struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RequestBackfillPrio >,
        boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
        > reactions;
      explicit RepNotRecovering(my_context ctx);
      boost::statechart::result react(const RequestBackfillPrio &evt);
      boost::statechart::result react(const RejectRemoteReservation &evt);
      void exit();
    };

    struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< AllReplicasRecovered >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< RequestBackfill >
        > reactions;
      explicit Recovering(my_context ctx);
      void exit();
      void release_reservations(bool cancel = false);
      boost::statechart::result react(const AllReplicasRecovered &evt);
      boost::statechart::result react(const DeferRecovery& evt);
      boost::statechart::result react(const UnfoundRecovery& evt);
      boost::statechart::result react(const RequestBackfill &evt);
    };

    struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        boost::statechart::transition< AllRemotesReserved, Recovering >
        > reactions;
      set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
      explicit WaitRemoteRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      void exit();
    };

    struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
        boost::statechart::custom_reaction< RecoveryTooFull >
        > reactions;
      explicit WaitLocalRecoveryReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RecoveryTooFull &evt);
    };

    struct Activating : boost::statechart::state< Activating, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition< AllReplicasRecovered, Recovered >,
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
        > reactions;
      explicit Activating(my_context ctx);
      void exit();
    };

    struct Stray : boost::statechart::state< Stray, Started >, NamedState {
      map<int, pair<pg_query_t, epoch_t> > pending_queries;

      explicit Stray(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< RecoveryDone >
        > reactions;
      boost::statechart::result react(const MQuery& query);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const RecoveryDone&) {
        return discard_event();
      }
    };

    struct GetLog;

    struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
      set<pg_shard_t> peer_info_requested;

      explicit GetInfo(my_context ctx);
      void exit();
      void get_infos();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::transition< GotInfo, GetLog >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< IsDown, Down >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };

    struct GotLog : boost::statechart::event< GotLog > {
      GotLog() : boost::statechart::event< GotLog >() {}
    };

    struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
      pg_shard_t auth_log_shard;
      boost::intrusive_ptr<MOSDPGLog> msg;

      explicit GetLog(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< GotLog >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::transition< IsIncomplete, Incomplete >
        > reactions;
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const GotLog&);
    };

    struct WaitUpThru;

    struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
      set<pg_shard_t> peer_missing_requested;

      explicit GetMissing(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::transition< NeedUpThru, WaitUpThru >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
    };

    struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
      explicit WaitUpThru(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MLogRec >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap& am);
      boost::statechart::result react(const MLogRec& logrec);
    };

    struct Down : boost::statechart::state< Down, Peering >, NamedState {
      explicit Down(my_context ctx);
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >
        > reactions;
      boost::statechart::result react(const QueryState& infoevt);
      void exit();
    };

    struct Incomplete : boost::statechart::state< Incomplete, Peering >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< QueryState >
        > reactions;
      explicit Incomplete(my_context ctx);
      boost::statechart::result react(const AdvMap &advmap);
      boost::statechart::result react(const MNotifyRec& infoevt);
      boost::statechart::result react(const QueryState& infoevt);
      void exit();
    };


    RecoveryMachine machine;
    PG *pg;

    /// context passed in by state machine caller
    RecoveryCtx *orig_ctx;

    /// populated if we are buffering messages pending a flush
    boost::optional<BufferedRecoveryMessages> messages_pending_flush;

    /**
     * populated between start_handle() and end_handle(), points into
     * the message lists for messages_pending_flush while blocking messages
     * or into orig_ctx otherwise
     */
    boost::optional<RecoveryCtx> rctx;

  public:
    explicit RecoveryState(PG *pg)
      : machine(this, pg), pg(pg), orig_ctx(0) {
      machine.initiate();
    }

    void handle_event(const boost::statechart::event_base &evt,
                      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt);
      end_handle();
    }

    void handle_event(CephPeeringEvtRef evt,
                      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt->get_event());
      end_handle();
    }
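
    // Illustrative sketch (assumed usage, not part of the original header):
    // callers deliver peering events through handle_event() so that
    // start_handle()/end_handle() bracket every state transition, e.g.
    //
    //   PG::RecoveryCtx rctx = create_context();       // hypothetical helper
    //   pg->recovery_state.handle_event(ActMap(), &rctx);
    //   dispatch_context(rctx, pg, pg->get_osdmap());  // hypothetical helper
    //
    // create_context()/dispatch_context() stand in for whatever machinery
    // builds and flushes the RecoveryCtx; that logic lives outside this file.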

  } recovery_state;


public:
  PG(OSDService *o, OSDMapRef curmap,
     const PGPool &pool, spg_t p);
  ~PG() override;

private:
  // Prevent copying
  explicit PG(const PG& rhs);
  PG& operator=(const PG& rhs);
  const spg_t pg_id;
  uint64_t peer_features;
  uint64_t acting_features;
  uint64_t upacting_features;

  epoch_t last_epoch;

public:
  const spg_t& get_pgid() const { return pg_id; }

  void reset_min_peer_features() {
    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  }
  uint64_t get_min_peer_features() const { return peer_features; }
  void apply_peer_features(uint64_t f) { peer_features &= f; }

  uint64_t get_min_acting_features() const { return acting_features; }
  uint64_t get_min_upacting_features() const { return upacting_features; }
  bool perform_deletes_during_peering() const {
    return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
  }

  void init_primary_up_acting(
    const vector<int> &newup,
    const vector<int> &newacting,
    int new_up_primary,
    int new_acting_primary) {
    actingset.clear();
    acting = newacting;
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] != CRUSH_ITEM_NONE)
        actingset.insert(
          pg_shard_t(
            acting[i],
            pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    upset.clear();
    up = newup;
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] != CRUSH_ITEM_NONE)
        upset.insert(
          pg_shard_t(
            up[i],
            pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    if (!pool.info.ec_pool()) {
      up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
      primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
      return;
    }
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    assert(up_primary.osd == new_up_primary);
    assert(primary.osd == new_acting_primary);
  }
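  // Worked example (editorial, derived from the code above): for an EC pool
  // with newacting = {3, CRUSH_ITEM_NONE, 5} and new_acting_primary = 3,
  // actingset becomes { pg_shard_t(3, shard_id_t(0)),
  // pg_shard_t(5, shard_id_t(2)) } -- the vector index is the shard id and
  // holes are skipped -- and primary becomes pg_shard_t(3, shard_id_t(0)).
  // For a replicated pool every entry uses shard_id_t::NO_SHARD instead.
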
  pg_shard_t get_primary() const { return primary; }

  int get_role() const { return role; }
  void set_role(int r) { role = r; }

  bool is_primary() const { return pg_whoami == primary; }
  bool is_replica() const { return role > 0; }

  epoch_t get_last_peering_reset() const { return last_peering_reset; }

  bool state_test(int m) const { return (state & m) != 0; }
  void state_set(int m) { state |= m; }
  void state_clear(int m) { state &= ~m; }
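  // Note (editorial): state is a bitmask of PG_STATE_* flags, so
  // state_test() returns true if *any* bit in m is set; e.g.
  //   state_test(PG_STATE_ACTIVE | PG_STATE_PEERED)
  // asks "active or peered?", which is what is_peered() below computes
  // with two separate tests.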

  bool is_complete() const { return info.last_complete == info.last_update; }
  bool should_send_notify() const { return send_notify; }

  int get_state() const { return state; }
  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
  bool is_peering() const { return state_test(PG_STATE_PEERING); }
  bool is_down() const { return state_test(PG_STATE_DOWN); }
  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }

  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
  bool is_peered() const {
    return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
  }
  bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }

  bool is_empty() const { return info.last_update == eversion_t(0,0); }

  void init(
    int role,
    const vector<int>& up,
    int up_primary,
    const vector<int>& acting,
    int acting_primary,
    const pg_history_t& history,
    const PastIntervals& pim,
    bool backfill,
    ObjectStore::Transaction *t);

  // pg on-disk state
  void do_pending_flush();

  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
  static void _init(ObjectStore::Transaction& t,
                    spg_t pgid, const pg_pool_t *pool);

private:
  void prepare_write_info(map<string,bufferlist> *km);

  void update_store_with_options();
  void update_store_on_load();

public:
  static int _prepare_write_info(
    CephContext* cct,
    map<string,bufferlist> *km,
    epoch_t epoch,
    pg_info_t &info,
    pg_info_t &last_written_info,
    PastIntervals &past_intervals,
    bool dirty_big_info,
    bool dirty_epoch,
    bool try_fast_info,
    PerfCounters *logger = nullptr);
  void write_if_dirty(ObjectStore::Transaction& t);

  PGLog::IndexedLog projected_log;
  bool check_in_progress_op(
    const osd_reqid_t &r,
    eversion_t *version,
    version_t *user_version,
    int *return_code) const;
  eversion_t projected_last_update;
  eversion_t get_next_version() const {
    eversion_t at_version(
      get_osdmap()->get_epoch(),
      projected_last_update.version+1);
    assert(at_version > info.last_update);
    assert(at_version > pg_log.get_head());
    assert(at_version > projected_last_update);
    return at_version;
  }
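  // Note (editorial): an eversion_t orders by (epoch, version), so the
  // value built here -- current map epoch plus the next projected version
  // number -- is guaranteed by the asserts to sort after everything
  // already written or projected; e.g. with map epoch 40 and
  // projected_last_update 38'10, the next op is stamped 40'11.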

  void add_log_entry(const pg_log_entry_t& e, bool applied);
  void append_log(
    const vector<pg_log_entry_t>& logv,
    eversion_t trim_to,
    eversion_t roll_forward_to,
    ObjectStore::Transaction &t,
    bool transaction_applied = true);
  bool check_log_for_corruption(ObjectStore *store);
  void trim_log();

  std::string get_corrupt_pg_log_name() const;
  static int read_info(
    ObjectStore *store, spg_t pgid, const coll_t &coll,
    bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
    __u8 &);
  void read_state(ObjectStore *store, bufferlist &bl);
  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
  static int peek_map_epoch(ObjectStore *store, spg_t pgid,
                            epoch_t *pepoch, bufferlist *bl);
  void update_snap_map(
    const vector<pg_log_entry_t> &log_entries,
    ObjectStore::Transaction& t);

  void filter_snapc(vector<snapid_t> &snaps);

  void log_weirdness();

  virtual void kick_snap_trim() = 0;
  virtual void snap_trimmer_scrub_complete() = 0;
  bool requeue_scrub(bool high_priority = false);
  void queue_recovery();
  bool queue_scrub();
  unsigned get_scrub_priority();

  /// share pg info after a pg is active
  void share_pg_info();


  bool append_log_entries_update_missing(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  /**
   * Merge entries updating missing as necessary on all
   * actingbackfill logs and missings (also missing_loc)
   */
  void merge_new_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  void reset_interval_flush();
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t,
                   list<Context *> *on_applied,
                   list<Context *> *on_safe);
  void set_last_peering_reset();
  bool pg_has_reset_since(epoch_t e) {
    assert(is_locked());
    return deleting || e < get_last_peering_reset();
  }

  void update_history(const pg_history_t& history);
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
                    pair<pg_shard_t, pg_info_t> &notify_info);
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
  void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);

  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);

  // OpRequest queueing
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
  bool old_peering_evt(CephPeeringEvtRef evt) {
    return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
  }
  static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
    return e <= cur_epoch;
  }
  bool have_same_or_newer_map(epoch_t e) {
    return e <= get_osdmap()->get_epoch();
  }
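  // Note (editorial): these guards filter stale messages by epoch. For
  // example, if the PG's current map epoch is 120, have_same_or_newer_map(118)
  // returns true (we already have at least map 118), while a peering event
  // whose epochs predate the last peering reset is dropped via
  // old_peering_evt() -> old_peering_msg().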

  bool op_has_sufficient_caps(OpRequestRef& op);


  // recovery bits
  void take_waiters();
  void queue_peering_event(CephPeeringEvtRef evt);
  void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
  void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
                   pg_shard_t from, const pg_query_t& q);
  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
  void queue_flushed(epoch_t started_at);
  void handle_advance_map(
    OSDMapRef osdmap, OSDMapRef lastmap,
    vector<int>& newup, int up_primary,
    vector<int>& newacting, int acting_primary,
    RecoveryCtx *rctx);
  void handle_activate_map(RecoveryCtx *rctx);
  void handle_create(RecoveryCtx *rctx);
  void handle_loaded(RecoveryCtx *rctx);
  void handle_query_state(Formatter *f);

  virtual void on_removal(ObjectStore::Transaction *t) = 0;


  // abstract bits
  virtual void do_request(
    OpRequestRef& op,
    ThreadPool::TPHandle &handle
  ) = 0;

  virtual void do_op(OpRequestRef& op) = 0;
  virtual void do_sub_op(OpRequestRef op) = 0;
  virtual void do_sub_op_reply(OpRequestRef op) = 0;
  virtual void do_scan(
    OpRequestRef op,
    ThreadPool::TPHandle &handle
  ) = 0;
  virtual void do_backfill(OpRequestRef op) = 0;
  virtual void snap_trimmer(epoch_t epoch_queued) = 0;

  virtual int do_command(
    cmdmap_t cmdmap,
    ostream& ss,
    bufferlist& idata,
    bufferlist& odata,
    ConnectionRef conn,
    ceph_tid_t tid) = 0;

  virtual void on_role_change() = 0;
  virtual void on_pool_change() = 0;
  virtual void on_change(ObjectStore::Transaction *t) = 0;
  virtual void on_activate() = 0;
  virtual void on_flushed() = 0;
  virtual void on_shutdown() = 0;
  virtual void check_blacklisted_watchers() = 0;
  virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;

  virtual bool agent_work(int max) = 0;
  virtual bool agent_work(int max, int agent_flush_quota) = 0;
  virtual void agent_stop() = 0;
  virtual void agent_delay() = 0;
  virtual void agent_clear() = 0;
  virtual void agent_choose_mode_restart() = 0;
};

ostream& operator<<(ostream& out, const PG& pg);

ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);

#endif