1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include <boost/container/flat_set.hpp>
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/ceph_assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46 #include "PGPeeringEvent.h"
47
48 #include "mgr/OSDPerfMetricTypes.h"
49
50 #include <atomic>
51 #include <list>
52 #include <memory>
53 #include <stack>
54 #include <string>
55 #include <tuple>
56
57 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
58 //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
59
60 class OSD;
61 class OSDService;
62 class OSDShard;
63 class OSDShardPGSlot;
64 class MOSDOp;
65 class MOSDPGScan;
66 class MOSDPGBackfill;
67 class MOSDPGInfo;
68
69 class PG;
70 struct OpRequest;
71 typedef OpRequest::Ref OpRequestRef;
72 class MOSDPGLog;
73 class CephContext;
74 class DynamicPerfStats;
75
76 namespace Scrub {
77 class Store;
78 }
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
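// Illustrative sketch (not part of the original header): nested states are tracked
// LIFO, so each enter_state()/exit_state() pair contributes one
// (enter_time, exit_time, name) tuple to state_history. Times and names below are
// hypothetical.
//
//   PGStateInstance psi;
//   psi.setepoch(100);
//   psi.enter_state(ceph_clock_now(), "Started");
//   psi.enter_state(ceph_clock_now(), "Started/Primary");
//   psi.exit_state(ceph_clock_now());   // records the "Started/Primary" interval
//   psi.exit_state(ceph_clock_now());   // records the enclosing "Started" interval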
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 string get_current_state() {
125 if (pi == nullptr) return "unknown";
126 return std::get<1>(pi->embedded_states.top());
127 }
128
129 private:
130 bool pg_in_destructor = false;
131 PG* thispg = nullptr;
132 std::unique_ptr<PGStateInstance> tmppi;
133 PGStateInstance* pi = nullptr;
134 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
135
136 };
137
138 #ifdef PG_DEBUG_REFS
139 #include "common/tracked_int_ptr.hpp"
140 uint64_t get_with_id(PG *pg);
141 void put_with_id(PG *pg, uint64_t id);
142 typedef TrackedIntPtr<PG> PGRef;
143 #else
144 typedef boost::intrusive_ptr<PG> PGRef;
145 #endif
146
147 class PGRecoveryStats {
148 struct per_state_info {
149 uint64_t enter, exit; // enter/exit counts
150 uint64_t events;
151 utime_t event_time; // time spent processing events
152 utime_t total_time; // total time in state
153 utime_t min_time, max_time;
154
155 // cppcheck-suppress unreachableCode
156 per_state_info() : enter(0), exit(0), events(0) {}
157 };
158 map<const char *,per_state_info> info;
159 Mutex lock;
160
161 public:
162   PGRecoveryStats() : lock("PGRecoveryStats::lock") {}
163
164 void reset() {
165 std::lock_guard l(lock);
166 info.clear();
167 }
168 void dump(ostream& out) {
169 std::lock_guard l(lock);
170 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
171 per_state_info& i = p->second;
172 out << i.enter << "\t" << i.exit << "\t"
173 << i.events << "\t" << i.event_time << "\t"
174 << i.total_time << "\t"
175 << i.min_time << "\t" << i.max_time << "\t"
176 << p->first << "\n";
177 }
178 }
179
180 void dump_formatted(Formatter *f) {
181 std::lock_guard l(lock);
182 f->open_array_section("pg_recovery_stats");
183 for (map<const char *,per_state_info>::iterator p = info.begin();
184 p != info.end(); ++p) {
185 per_state_info& i = p->second;
186 f->open_object_section("recovery_state");
187 f->dump_int("enter", i.enter);
188 f->dump_int("exit", i.exit);
189 f->dump_int("events", i.events);
190 f->dump_stream("event_time") << i.event_time;
191 f->dump_stream("total_time") << i.total_time;
192 f->dump_stream("min_time") << i.min_time;
193 f->dump_stream("max_time") << i.max_time;
194 vector<string> states;
195 get_str_vec(p->first, "/", states);
196 f->open_array_section("nested_states");
197 for (vector<string>::iterator st = states.begin();
198 st != states.end(); ++st) {
199 f->dump_string("state", *st);
200 }
201 f->close_section();
202 f->close_section();
203 }
204 f->close_section();
205 }
206
207 void log_enter(const char *s) {
208 std::lock_guard l(lock);
209 info[s].enter++;
210 }
211 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
212 std::lock_guard l(lock);
213 per_state_info &i = info[s];
214 i.exit++;
215 i.total_time += dur;
216 if (dur > i.max_time)
217 i.max_time = dur;
218 if (dur < i.min_time || i.min_time == utime_t())
219 i.min_time = dur;
220 i.events += events;
221 i.event_time += event_dur;
222 }
223 };
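// Illustrative usage sketch (the caller and the duration/event values are
// hypothetical); the recovery state machine would bracket each state with
// log_enter()/log_exit():
//
//   PGRecoveryStats stats;
//   stats.log_enter("Started/Primary/Active");
//   ...                                          // process events in that state
//   stats.log_exit("Started/Primary/Active",
//                  dur, nevents, event_dur);     // accumulates total/min/max times
//   stats.dump(out);  // per state: enter exit events event_time total min max name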
224
225 struct PGPool {
226 CephContext* cct;
227 epoch_t cached_epoch;
228 int64_t id;
229 string name;
230
231 pg_pool_t info;
232 SnapContext snapc; // the default pool snapc, ready to go.
233
234 // these two sets are for < mimic only
235 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
236 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
237
238 PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
239 const string& name)
240 : cct(cct),
241 cached_epoch(map->get_epoch()),
242 id(i),
243 name(name),
244 info(info) {
245 snapc = info.get_snap_context();
246 if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
247 info.build_removed_snaps(cached_removed_snaps);
248 }
249 }
250
251 void update(CephContext *cct, OSDMapRef map);
252 };
253
254 /** PG - Replica Placement Group
255 *
256 */
257
258 class PG : public DoutPrefixProvider {
259 public:
260 // -- members --
261 const spg_t pg_id;
262 const coll_t coll;
263
264 ObjectStore::CollectionHandle ch;
265
266 struct RecoveryCtx;
267
268 // -- methods --
269 std::ostream& gen_prefix(std::ostream& out) const override;
270 CephContext *get_cct() const override {
271 return cct;
272 }
273 unsigned get_subsys() const override {
274 return ceph_subsys_osd;
275 }
276
277 const OSDMapRef& get_osdmap() const {
278 ceph_assert(is_locked());
279 ceph_assert(osdmap_ref);
280 return osdmap_ref;
281 }
282 epoch_t get_osdmap_epoch() const {
283 return osdmap_ref->get_epoch();
284 }
285
286 void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
287 handle.suspend_tp_timeout();
288 lock();
289 handle.reset_tp_timeout();
290 }
291 void lock(bool no_lockdep = false) const;
292 void unlock() const {
293 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
294 ceph_assert(!dirty_info);
295 ceph_assert(!dirty_big_info);
296 _lock.Unlock();
297 }
298 bool is_locked() const {
299 return _lock.is_locked();
300 }
301
302 const spg_t& get_pgid() const {
303 return pg_id;
304 }
305
306 const PGPool& get_pool() const {
307 return pool;
308 }
309 uint64_t get_last_user_version() const {
310 return info.last_user_version;
311 }
312 const pg_history_t& get_history() const {
313 return info.history;
314 }
315 bool get_need_up_thru() const {
316 return need_up_thru;
317 }
318 epoch_t get_same_interval_since() const {
319 return info.history.same_interval_since;
320 }
321
322 void set_last_scrub_stamp(utime_t t) {
323 info.stats.last_scrub_stamp = t;
324 info.history.last_scrub_stamp = t;
325 }
326
327 void set_last_deep_scrub_stamp(utime_t t) {
328 info.stats.last_deep_scrub_stamp = t;
329 info.history.last_deep_scrub_stamp = t;
330 }
331
332 bool is_deleting() const {
333 return deleting;
334 }
335 bool is_deleted() const {
336 return deleted;
337 }
338 bool is_replica() const {
339 return role > 0;
340 }
341 bool is_primary() const {
342 return pg_whoami == primary;
343 }
344 bool pg_has_reset_since(epoch_t e) {
345 ceph_assert(is_locked());
346 return deleted || e < get_last_peering_reset();
347 }
348
349 bool is_ec_pg() const {
350 return pool.info.is_erasure();
351 }
352 int get_role() const {
353 return role;
354 }
355 const vector<int> get_acting() const {
356 return acting;
357 }
358 int get_acting_primary() const {
359 return primary.osd;
360 }
361 pg_shard_t get_primary() const {
362 return primary;
363 }
364 const vector<int> get_up() const {
365 return up;
366 }
367 int get_up_primary() const {
368 return up_primary.osd;
369 }
370 const PastIntervals& get_past_intervals() const {
371 return past_intervals;
372 }
373
374 /// initialize created PG
375 void init(
376 int role,
377 const vector<int>& up,
378 int up_primary,
379 const vector<int>& acting,
380 int acting_primary,
381 const pg_history_t& history,
382 const PastIntervals& pim,
383 bool backfill,
384 ObjectStore::Transaction *t);
385
386 /// read existing pg state off disk
387 void read_state(ObjectStore *store);
388 static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
389
390 static int get_latest_struct_v() {
391 return latest_struct_v;
392 }
393 static int get_compat_struct_v() {
394 return compat_struct_v;
395 }
396 static int read_info(
397 ObjectStore *store, spg_t pgid, const coll_t &coll,
398 pg_info_t &info, PastIntervals &past_intervals,
399 __u8 &);
400 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
401
402 void rm_backoff(BackoffRef b);
403
404 void update_snap_mapper_bits(uint32_t bits) {
405 snap_mapper.update_bits(bits);
406 }
407 void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
408 virtual void split_colls(
409 spg_t child,
410 int split_bits,
411 int seed,
412 const pg_pool_t *pool,
413 ObjectStore::Transaction *t) = 0;
414 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
415 void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
416 unsigned split_bits,
417 const pg_merge_meta_t& last_pg_merge_meta);
418 void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);
419
420 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
421 void reg_next_scrub();
422 void unreg_next_scrub();
423
424 bool is_forced_recovery_or_backfill() const {
425 return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
426 }
427 bool set_force_recovery(bool b);
428 bool set_force_backfill(bool b);
429
430 void queue_peering_event(PGPeeringEventRef evt);
431 void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
432 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
433 void queue_flushed(epoch_t started_at);
434 void handle_advance_map(
435 OSDMapRef osdmap, OSDMapRef lastmap,
436 vector<int>& newup, int up_primary,
437 vector<int>& newacting, int acting_primary,
438 RecoveryCtx *rctx);
439 void handle_activate_map(RecoveryCtx *rctx);
440 void handle_initialize(RecoveryCtx *rctx);
441 void handle_query_state(Formatter *f);
442
443 /**
444 * @param ops_begun returns how many recovery ops the function started
445 * @returns true if any useful work was accomplished; false otherwise
446 */
447 virtual bool start_recovery_ops(
448 uint64_t max,
449 ThreadPool::TPHandle &handle,
450 uint64_t *ops_begun) = 0;
451
452 // more work after the above, but with a RecoveryCtx
453 void find_unfound(epoch_t queued, RecoveryCtx *rctx);
454
455 virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
456
457 void dump_pgstate_history(Formatter *f);
458 void dump_missing(Formatter *f);
459
460 void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
461 void with_heartbeat_peers(std::function<void(int)> f);
462
463 void shutdown();
464 virtual void on_shutdown() = 0;
465
466 bool get_must_scrub() const {
467 return scrubber.must_scrub;
468 }
469 bool sched_scrub();
470
471 virtual void do_request(
472 OpRequestRef& op,
473 ThreadPool::TPHandle &handle
474 ) = 0;
475 virtual void clear_cache() = 0;
476 virtual int get_cache_obj_count() = 0;
477
478 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
479 virtual int do_command(
480 cmdmap_t cmdmap,
481 ostream& ss,
482 bufferlist& idata,
483 bufferlist& odata,
484 ConnectionRef conn,
485 ceph_tid_t tid) = 0;
486
487 virtual bool agent_work(int max) = 0;
488 virtual bool agent_work(int max, int agent_flush_quota) = 0;
489 virtual void agent_stop() = 0;
490 virtual void agent_delay() = 0;
491 virtual void agent_clear() = 0;
492 virtual void agent_choose_mode_restart() = 0;
493
494 virtual void on_removal(ObjectStore::Transaction *t) = 0;
495
496 void _delete_some(ObjectStore::Transaction *t);
497
498 virtual void set_dynamic_perf_stats_queries(
499 const std::list<OSDPerfMetricQuery> &queries) {
500 }
501 virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
502 }
503
504 // reference counting
505 #ifdef PG_DEBUG_REFS
506 uint64_t get_with_id();
507 void put_with_id(uint64_t);
508 void dump_live_ids();
509 #endif
510 void get(const char* tag);
511 void put(const char* tag);
512 int get_num_ref() {
513 return ref;
514 }
515
516 // ctor
517 PG(OSDService *o, OSDMapRef curmap,
518 const PGPool &pool, spg_t p);
519 ~PG() override;
520
521 // prevent copying
522 explicit PG(const PG& rhs) = delete;
523 PG& operator=(const PG& rhs) = delete;
524
525 protected:
526 // -------------
527 // protected
528 OSDService *osd;
529 public:
530 OSDShard *osd_shard = nullptr;
531 OSDShardPGSlot *pg_slot = nullptr;
532 protected:
533 CephContext *cct;
534
535 // osdmap
536 OSDMapRef osdmap_ref;
537
538 PGPool pool;
539
540 // locking and reference counting.
541 // I destroy myself when the reference count hits zero.
542 // lock() should be called before doing anything.
543 // get() should be called on pointer copy (to another thread, etc.).
544 // put() should be called on destruction of some previously copied pointer.
545 // unlock() when done with the current pointer (_most common_).
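  // Illustrative sketch (caller code and 'queued_epoch' are hypothetical): a worker
  // thread holding a PGRef takes the PG lock, re-checks for an interval reset, and
  // unlocks when done; get()/put() happen implicitly via boost::intrusive_ptr.
  //
  //   PGRef pg = ...;                     // ref taken via intrusive_ptr_add_ref()
  //   pg->lock();
  //   if (!pg->pg_has_reset_since(queued_epoch)) {
  //     ...                               // do work under PG::_lock
  //   }
  //   pg->unlock();                       // asserts !dirty_info && !dirty_big_info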
546 mutable Mutex _lock = {"PG::_lock"};
547
548 std::atomic<unsigned int> ref{0};
549
550 #ifdef PG_DEBUG_REFS
551 Mutex _ref_id_lock = {"PG::_ref_id_lock"};
552 map<uint64_t, string> _live_ids;
553 map<string, uint64_t> _tag_counts;
554 uint64_t _ref_id = 0;
555
556 friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
557 friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
558 #endif
559
560 private:
561 friend void intrusive_ptr_add_ref(PG *pg) {
562 pg->get("intptr");
563 }
564 friend void intrusive_ptr_release(PG *pg) {
565 pg->put("intptr");
566 }
567
568
569 // =====================
570
571 protected:
572 OSDriver osdriver;
573 SnapMapper snap_mapper;
574 bool eio_errors_to_process = false;
575
576 virtual PGBackend *get_pgbackend() = 0;
577 virtual const PGBackend* get_pgbackend() const = 0;
578
579 protected:
580 /*** PG ****/
581 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
582 IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
583 return get_pgbackend()->get_is_recoverable_predicate();
584 }
585 protected:
586 epoch_t last_persisted_osdmap;
587
588 void requeue_map_waiters();
589
590 void update_osdmap_ref(OSDMapRef newmap) {
591 ceph_assert(_lock.is_locked_by_me());
592 osdmap_ref = std::move(newmap);
593 }
594
595 protected:
596
597
598   bool deleting;  // true while the PG is being removed or the OSD is shutting down
599 atomic<bool> deleted = {false};
600
601 ZTracer::Endpoint trace_endpoint;
602
603
604 protected:
605 bool dirty_info, dirty_big_info;
606
607 protected:
608 // pg state
609 pg_info_t info; ///< current pg info
610 pg_info_t last_written_info; ///< last written info
611 __u8 info_struct_v = 0;
612 static const __u8 latest_struct_v = 10;
613 // v10 is the new past_intervals encoding
614 // v9 was fastinfo_key addition
615 // v8 was the move to a per-pg pgmeta object
616 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
617 // (first appeared in cuttlefish).
618 static const __u8 compat_struct_v = 10;
619 void upgrade(ObjectStore *store);
620
621 protected:
622 PGLog pg_log;
623 ghobject_t pgmeta_oid;
624
625 // ------------------
626 // MissingLoc
627
628 class MissingLoc {
629 public:
630 // a loc_count indicates how many locations we know in each of
631 // these distinct sets
632 struct loc_count_t {
633 int up = 0; //< up
634 int other = 0; //< other
635
636 friend bool operator<(const loc_count_t& l,
637 const loc_count_t& r) {
638 return (l.up < r.up ||
639 (l.up == r.up &&
640 (l.other < r.other)));
641 }
642 friend ostream& operator<<(ostream& out, const loc_count_t& l) {
643 ceph_assert(l.up >= 0);
644 ceph_assert(l.other >= 0);
645 return out << "(" << l.up << "+" << l.other << ")";
646 }
647 };
648
649
650 private:
651
652 loc_count_t _get_count(const set<pg_shard_t>& shards) {
653 loc_count_t r;
654 for (auto s : shards) {
655 if (pg->upset.count(s)) {
656 r.up++;
657 } else {
658 r.other++;
659 }
660 }
661 return r;
662 }
663
664 map<hobject_t, pg_missing_item> needs_recovery_map;
665 map<hobject_t, set<pg_shard_t> > missing_loc;
666 set<pg_shard_t> missing_loc_sources;
667
668 // for every entry in missing_loc, we count how many of each type of shard we have,
669 // and maintain totals here. The sum of the values for this map will always equal
670 // missing_loc.size().
671 map < shard_id_t, map<loc_count_t,int> > missing_by_count;
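    // For example (hypothetical numbers): for a replicated pool everything is keyed
    // under shard_id_t::NO_SHARD, and
    //   missing_by_count[NO_SHARD][ loc_count_t{up=1, other=1} ] == 3
    // means three missing objects each currently have one known location in the up
    // set and one location elsewhere.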
672
673 void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) {
674 if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
675 int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
676 // For completely missing shards initialize with empty set<pg_shard_t>
677 for (int i = 0 ; i < num_shards ; ++i) {
678 shard_id_t shard(i);
679 pgsbs[shard];
680 }
681 for (auto pgs: s)
682 pgsbs[pgs.shard].insert(pgs);
683 } else {
684 pgsbs[shard_id_t::NO_SHARD] = s;
685 }
686 }
687
688 void _inc_count(const set<pg_shard_t>& s) {
689 map< shard_id_t, set<pg_shard_t> > pgsbs;
690 pgs_by_shard_id(s, pgsbs);
691 for (auto shard: pgsbs)
692 ++missing_by_count[shard.first][_get_count(shard.second)];
693 }
694 void _dec_count(const set<pg_shard_t>& s) {
695 map< shard_id_t, set<pg_shard_t> > pgsbs;
696 pgs_by_shard_id(s, pgsbs);
697 for (auto shard: pgsbs) {
698 auto p = missing_by_count[shard.first].find(_get_count(shard.second));
699 ceph_assert(p != missing_by_count[shard.first].end());
700 if (--p->second == 0) {
701 missing_by_count[shard.first].erase(p);
702 }
703 }
704 }
705
706 PG *pg;
707 set<pg_shard_t> empty_set;
708 public:
709 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
710 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
711 explicit MissingLoc(PG *pg)
712 : pg(pg) { }
713 void set_backend_predicates(
714 IsPGReadablePredicate *_is_readable,
715 IsPGRecoverablePredicate *_is_recoverable) {
716 is_readable.reset(_is_readable);
717 is_recoverable.reset(_is_recoverable);
718 }
719 std::ostream& gen_prefix(std::ostream& out) const {
720 return pg->gen_prefix(out);
721 }
722 bool needs_recovery(
723 const hobject_t &hoid,
724 eversion_t *v = 0) const {
725 map<hobject_t, pg_missing_item>::const_iterator i =
726 needs_recovery_map.find(hoid);
727 if (i == needs_recovery_map.end())
728 return false;
729 if (v)
730 *v = i->second.need;
731 return true;
732 }
733 bool is_deleted(const hobject_t &hoid) const {
734 auto i = needs_recovery_map.find(hoid);
735 if (i == needs_recovery_map.end())
736 return false;
737 return i->second.is_delete();
738 }
739 bool is_unfound(const hobject_t &hoid) const {
740 auto it = needs_recovery_map.find(hoid);
741 if (it == needs_recovery_map.end()) {
742 return false;
743 }
744 if (it->second.is_delete()) {
745 return false;
746 }
747 auto mit = missing_loc.find(hoid);
748 return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
749 }
750 bool readable_with_acting(
751 const hobject_t &hoid,
752 const set<pg_shard_t> &acting) const;
753 uint64_t num_unfound() const {
754 uint64_t ret = 0;
755 for (map<hobject_t, pg_missing_item>::const_iterator i =
756 needs_recovery_map.begin();
757 i != needs_recovery_map.end();
758 ++i) {
759 if (i->second.is_delete())
760 continue;
761 auto mi = missing_loc.find(i->first);
762 if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
763 ++ret;
764 }
765 return ret;
766 }
767
768 bool have_unfound() const {
769 for (map<hobject_t, pg_missing_item>::const_iterator i =
770 needs_recovery_map.begin();
771 i != needs_recovery_map.end();
772 ++i) {
773 if (i->second.is_delete())
774 continue;
775 auto mi = missing_loc.find(i->first);
776 if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
777 return true;
778 }
779 return false;
780 }
781 void clear() {
782 needs_recovery_map.clear();
783 missing_loc.clear();
784 missing_loc_sources.clear();
785 missing_by_count.clear();
786 }
787
788 void add_location(const hobject_t &hoid, pg_shard_t location) {
789 auto p = missing_loc.find(hoid);
790 if (p == missing_loc.end()) {
791 p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
792 } else {
793 _dec_count(p->second);
794 }
795 p->second.insert(location);
796 _inc_count(p->second);
797 }
798 void remove_location(const hobject_t &hoid, pg_shard_t location) {
799 auto p = missing_loc.find(hoid);
800 if (p != missing_loc.end()) {
801 _dec_count(p->second);
802 p->second.erase(location);
803 if (p->second.empty()) {
804 missing_loc.erase(p);
805 } else {
806 _inc_count(p->second);
807 }
808 }
809 }
810
811 void clear_location(const hobject_t &hoid) {
812 auto p = missing_loc.find(hoid);
813 if (p != missing_loc.end()) {
814 _dec_count(p->second);
815 missing_loc.erase(p);
816 }
817 }
818
819 void add_active_missing(const pg_missing_t &missing) {
820 for (map<hobject_t, pg_missing_item>::const_iterator i =
821 missing.get_items().begin();
822 i != missing.get_items().end();
823 ++i) {
824 map<hobject_t, pg_missing_item>::const_iterator j =
825 needs_recovery_map.find(i->first);
826 if (j == needs_recovery_map.end()) {
827 needs_recovery_map.insert(*i);
828 } else {
829 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
830 << i->first << " have " << j->second
831 << " tried to add " << i->second << dendl;
832 ceph_assert(i->second.need == j->second.need);
833 }
834 }
835 }
836
837 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
838 needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
839 }
840 void revise_need(const hobject_t &hoid, eversion_t need) {
841 auto it = needs_recovery_map.find(hoid);
842 ceph_assert(it != needs_recovery_map.end());
843 it->second.need = need;
844 }
845
846 /// Adds info about a possible recovery source
847 bool add_source_info(
848 pg_shard_t source, ///< [in] source
849 const pg_info_t &oinfo, ///< [in] info
850 const pg_missing_t &omissing, ///< [in] (optional) missing
851 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
852 ); ///< @return whether a new object location was discovered
853
854 /// Adds recovery sources in batch
855 void add_batch_sources_info(
856       const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
857 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
858 );
859
860     /// Uses osdmap to update structures for now-down sources
861 void check_recovery_sources(const OSDMapRef& osdmap);
862
863 /// Call when hoid is no longer missing in acting set
864 void recovered(const hobject_t &hoid) {
865 needs_recovery_map.erase(hoid);
866 auto p = missing_loc.find(hoid);
867 if (p != missing_loc.end()) {
868 _dec_count(p->second);
869 missing_loc.erase(p);
870 }
871 }
872
873 /// Call to update structures for hoid after a change
874 void rebuild(
875 const hobject_t &hoid,
876 pg_shard_t self,
877 const set<pg_shard_t> to_recover,
878 const pg_info_t &info,
879 const pg_missing_t &missing,
880 const map<pg_shard_t, pg_missing_t> &pmissing,
881 const map<pg_shard_t, pg_info_t> &pinfo) {
882 recovered(hoid);
883 boost::optional<pg_missing_item> item;
884 auto miter = missing.get_items().find(hoid);
885 if (miter != missing.get_items().end()) {
886 item = miter->second;
887 } else {
888 for (auto &&i: to_recover) {
889 if (i == self)
890 continue;
891 auto pmiter = pmissing.find(i);
892 ceph_assert(pmiter != pmissing.end());
893 miter = pmiter->second.get_items().find(hoid);
894 if (miter != pmiter->second.get_items().end()) {
895 item = miter->second;
896 break;
897 }
898 }
899 }
900 if (!item)
901 return; // recovered!
902
903 needs_recovery_map[hoid] = *item;
904 if (item->is_delete())
905 return;
906 auto mliter =
907 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
908 ceph_assert(info.last_backfill.is_max());
909 ceph_assert(info.last_update >= item->need);
910 if (!missing.is_missing(hoid))
911 mliter->second.insert(self);
912 for (auto &&i: pmissing) {
913 if (i.first == self)
914 continue;
915 auto pinfoiter = pinfo.find(i.first);
916 ceph_assert(pinfoiter != pinfo.end());
917 if (item->need <= pinfoiter->second.last_update &&
918 hoid <= pinfoiter->second.last_backfill &&
919 !i.second.is_missing(hoid))
920 mliter->second.insert(i.first);
921 }
922 _inc_count(mliter->second);
923 }
924
925 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
926 auto it = missing_loc.find(hoid);
927 return it == missing_loc.end() ? empty_set : it->second;
928 }
929 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
930 return missing_loc;
931 }
932 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
933 return needs_recovery_map;
934 }
935 const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
936 return missing_by_count;
937 }
938 } missing_loc;
939
940 PastIntervals past_intervals;
941
942 interval_set<snapid_t> snap_trimq;
943
944 /* You should not use these items without taking their respective queue locks
945 * (if they have one) */
946 xlist<PG*>::item stat_queue_item;
947 bool scrub_queued;
948 bool recovery_queued;
949
950 int recovery_ops_active;
951 set<pg_shard_t> waiting_on_backfill;
952 #ifdef DEBUG_RECOVERY_OIDS
953 multiset<hobject_t> recovering_oids;
954 #endif
955
956 protected:
957 int role; // 0 = primary, 1 = replica, -1=none.
958 uint64_t state; // PG_STATE_*
959
960 bool send_notify; ///< true if we are non-primary and should notify the primary
961
962 protected:
963 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
964 eversion_t last_complete_ondisk; // last_complete that has committed.
965 eversion_t last_update_applied;
966
967 // entries <= last_rollback_info_trimmed_to_applied have been trimmed
968 eversion_t last_rollback_info_trimmed_to_applied;
969
970 // primary state
971 protected:
972 pg_shard_t primary;
973 pg_shard_t pg_whoami;
974 pg_shard_t up_primary;
975 vector<int> up, acting, want_acting;
976 // acting_recovery_backfill contains shards that are acting,
977 // async recovery targets, or backfill targets.
978 set<pg_shard_t> acting_recovery_backfill, actingset, upset;
979 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
980 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
981 eversion_t pg_trim_to;
982
983 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
984
985 protected:
986 // [primary only] content recovery state
987 struct BufferedRecoveryMessages {
988 map<int, map<spg_t, pg_query_t> > query_map;
989 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
990 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
991 };
992
993 public:
994 bool dne() { return info.dne(); }
995 struct RecoveryCtx {
996 utime_t start_time;
997 map<int, map<spg_t, pg_query_t> > *query_map;
998 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
999 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
1000 ObjectStore::Transaction *transaction;
1001 ThreadPool::TPHandle* handle;
1002 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
1003 map<int,
1004 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
1005 map<int,
1006 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
1007 ObjectStore::Transaction *transaction)
1008 : query_map(query_map), info_map(info_map),
1009 notify_list(notify_list),
1010 transaction(transaction),
1011 handle(NULL) {}
1012
1013 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
1014 : query_map(&(buf.query_map)),
1015 info_map(&(buf.info_map)),
1016 notify_list(&(buf.notify_list)),
1017 transaction(rctx.transaction),
1018 handle(rctx.handle) {}
1019
1020 void accept_buffered_messages(BufferedRecoveryMessages &m) {
1021 ceph_assert(query_map);
1022 ceph_assert(info_map);
1023 ceph_assert(notify_list);
1024 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
1025 i != m.query_map.end();
1026 ++i) {
1027 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
1028 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
1029 j != i->second.end();
1030 ++j) {
1031 omap[j->first] = j->second;
1032 }
1033 }
1034 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
1035 = m.info_map.begin();
1036 i != m.info_map.end();
1037 ++i) {
1038 vector<pair<pg_notify_t, PastIntervals> > &ovec =
1039 (*info_map)[i->first];
1040 ovec.reserve(ovec.size() + i->second.size());
1041 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
1042 }
1043 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
1044 = m.notify_list.begin();
1045 i != m.notify_list.end();
1046 ++i) {
1047 vector<pair<pg_notify_t, PastIntervals> > &ovec =
1048 (*notify_list)[i->first];
1049 ovec.reserve(ovec.size() + i->second.size());
1050 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
1051 }
1052 }
1053
1054 void send_notify(pg_shard_t to,
1055 const pg_notify_t &info, const PastIntervals &pi) {
1056 ceph_assert(notify_list);
1057 (*notify_list)[to.osd].push_back(make_pair(info, pi));
1058 }
1059 };
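// Illustrative sketch of the buffering pattern these two types support (the caller
// context and the 'rctx' pointer are hypothetical): stage outgoing messages into a
// BufferedRecoveryMessages via a temporary RecoveryCtx, then merge them back.
//
//   BufferedRecoveryMessages buf;
//   RecoveryCtx buffered(buf, *rctx);      // its maps point into 'buf'
//   ...                                    // run peering handlers against 'buffered'
//   rctx->accept_buffered_messages(buf);   // fold queries/infos/notifies back in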
1060 protected:
1061
1062 PGStateHistory pgstate_history;
1063
1064 struct NamedState {
1065 const char *state_name;
1066 utime_t enter_time;
1067 PG* pg;
1068 const char *get_state_name() { return state_name; }
1069 NamedState(PG *pg_, const char *state_name_)
1070 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
1071 pg->pgstate_history.enter(pg, enter_time, state_name);
1072 }
1073 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
1074 };
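// Illustrative sketch (hypothetical state, statechart plumbing omitted): recovery
// states derive from NamedState so that construction and destruction bracket the
// time spent in the state via pgstate_history.enter()/exit().
//
//   struct Active : NamedState {
//     explicit Active(PG *pg) : NamedState(pg, "Started/Primary/Active") {}
//   };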
1075
1076
1077
1078 protected:
1079
1080 /*
1081 * peer_info -- projected (updates _before_ replicas ack)
1082 * peer_missing -- committed (updates _after_ replicas ack)
1083 */
1084
1085 bool need_up_thru;
1086 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
1087 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
1088 map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info
1089 set<pg_shard_t> peer_purged; // peers purged
1090 map<pg_shard_t, pg_missing_t> peer_missing;
1091 set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
1092 set<pg_shard_t> peer_missing_requested;
1093
1094 // i deleted these strays; ignore racing PGInfo from them
1095 set<pg_shard_t> peer_activated;
1096
1097 // primary-only, recovery-only state
1098 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
1099 // which are unfound on the primary
1100 epoch_t last_peering_reset;
1101
1102 epoch_t get_last_peering_reset() const {
1103 return last_peering_reset;
1104 }
1105
1106 /* heartbeat peers */
1107 void set_probe_targets(const set<pg_shard_t> &probe_set);
1108 void clear_probe_targets();
1109
1110 Mutex heartbeat_peer_lock;
1111 set<int> heartbeat_peers;
1112 set<int> probe_targets;
1113
1114 public:
1115 /**
1116 * BackfillInterval
1117 *
1118 * Represents the objects in a range [begin, end)
1119 *
1120 * Possible states:
1121    * 1) begin == end == hobject_t() indicates that the interval is unpopulated
1122 * 2) Else, objects contains all objects in [begin, end)
1123 */
1124 struct BackfillInterval {
1125 // info about a backfill interval on a peer
1126 eversion_t version; /// version at which the scan occurred
1127 map<hobject_t,eversion_t> objects;
1128 hobject_t begin;
1129 hobject_t end;
1130
1131 /// clear content
1132 void clear() {
1133 *this = BackfillInterval();
1134 }
1135
1136 /// clear objects list only
1137 void clear_objects() {
1138 objects.clear();
1139 }
1140
1141 /// reinstantiate with a new start+end position and sort order
1142 void reset(hobject_t start) {
1143 clear();
1144 begin = end = start;
1145 }
1146
1147 /// true if there are no objects in this interval
1148 bool empty() const {
1149 return objects.empty();
1150 }
1151
1152 /// true if interval extends to the end of the range
1153 bool extends_to_end() const {
1154 return end.is_max();
1155 }
1156
1157 /// removes items <= soid and adjusts begin to the first object
1158 void trim_to(const hobject_t &soid) {
1159 trim();
1160 while (!objects.empty() &&
1161 objects.begin()->first <= soid) {
1162 pop_front();
1163 }
1164 }
1165
1166 /// Adjusts begin to the first object
1167 void trim() {
1168 if (!objects.empty())
1169 begin = objects.begin()->first;
1170 else
1171 begin = end;
1172 }
1173
1174 /// drop first entry, and adjust @begin accordingly
1175 void pop_front() {
1176 ceph_assert(!objects.empty());
1177 objects.erase(objects.begin());
1178 trim();
1179 }
1180
1181 /// dump
1182 void dump(Formatter *f) const {
1183 f->dump_stream("begin") << begin;
1184 f->dump_stream("end") << end;
1185 f->open_array_section("objects");
1186 for (map<hobject_t, eversion_t>::const_iterator i =
1187 objects.begin();
1188 i != objects.end();
1189 ++i) {
1190 f->open_object_section("object");
1191 f->dump_stream("object") << i->first;
1192 f->dump_stream("version") << i->second;
1193 f->close_section();
1194 }
1195 f->close_section();
1196 }
1197 };
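// Illustrative sketch (not from the original source; 'last_backfill_started' is a
// hypothetical cursor) of how a BackfillInterval is typically consumed: repopulate
// it from a scan, then advance past objects as they are handled.
//
//   BackfillInterval bi;
//   bi.reset(last_backfill_started);     // begin == end == cursor, objects empty
//   ...                                  // a scan fills bi.objects and bi.end
//   bi.trim_to(last_backfill_started);   // drop entries <= the cursor
//   while (!bi.empty()) {
//     const hobject_t& next = bi.begin;  // begin tracks the first remaining object
//     ...                                // recover/push 'next'
//     bi.pop_front();                    // erase it and re-trim begin
//   }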
1198
1199 protected:
1200 BackfillInterval backfill_info;
1201 map<pg_shard_t, BackfillInterval> peer_backfill_info;
1202 bool backfill_reserved;
1203 bool backfill_reserving;
1204
1205 set<pg_shard_t> backfill_targets, async_recovery_targets;
1206
1207 // The primary's num_bytes and local num_bytes for this pg, only valid
1208 // during backfill for non-primary shards.
1209 // Both of these are adjusted for EC to reflect the on-disk bytes
1210 std::atomic<int64_t> primary_num_bytes = 0;
1211 std::atomic<int64_t> local_num_bytes = 0;
1212
1213 public:
1214 bool is_backfill_targets(pg_shard_t osd) {
1215 return backfill_targets.count(osd);
1216 }
1217
1218 // Space reserved for backfill is primary_num_bytes - local_num_bytes
1219 // Don't care that difference itself isn't atomic
1220 uint64_t get_reserved_num_bytes() {
1221 int64_t primary = primary_num_bytes.load();
1222 int64_t local = local_num_bytes.load();
1223 if (primary > local)
1224 return primary - local;
1225 else
1226 return 0;
1227 }
1228
1229 bool is_remote_backfilling() {
1230 return primary_num_bytes.load() > 0;
1231 }
1232
1233 void set_reserved_num_bytes(int64_t primary, int64_t local);
1234 void clear_reserved_num_bytes();
1235
1236   // If the num_bytes values are inconsistent and local_num_bytes goes negative
1237   // it's ok, because it would then be ignored.
1238
1239 // The value of num_bytes could be negative,
1240 // but we don't let local_num_bytes go negative.
1241 void add_local_num_bytes(int64_t num_bytes) {
1242 if (num_bytes) {
1243 int64_t prev_bytes = local_num_bytes.load();
1244 int64_t new_bytes;
1245 do {
1246 new_bytes = prev_bytes + num_bytes;
1247 if (new_bytes < 0)
1248 new_bytes = 0;
1249 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
1250 }
1251 }
1252 void sub_local_num_bytes(int64_t num_bytes) {
1253 ceph_assert(num_bytes >= 0);
1254 if (num_bytes) {
1255 int64_t prev_bytes = local_num_bytes.load();
1256 int64_t new_bytes;
1257 do {
1258 new_bytes = prev_bytes - num_bytes;
1259 if (new_bytes < 0)
1260 new_bytes = 0;
1261 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
1262 }
1263 }
1264 // The value of num_bytes could be negative,
1265 // but we don't let info.stats.stats.sum.num_bytes go negative.
1266 void add_num_bytes(int64_t num_bytes) {
1267 ceph_assert(_lock.is_locked_by_me());
1268 if (num_bytes) {
1269 info.stats.stats.sum.num_bytes += num_bytes;
1270 if (info.stats.stats.sum.num_bytes < 0) {
1271 info.stats.stats.sum.num_bytes = 0;
1272 }
1273 }
1274 }
1275 void sub_num_bytes(int64_t num_bytes) {
1276 ceph_assert(_lock.is_locked_by_me());
1277 ceph_assert(num_bytes >= 0);
1278 if (num_bytes) {
1279 info.stats.stats.sum.num_bytes -= num_bytes;
1280 if (info.stats.stats.sum.num_bytes < 0) {
1281 info.stats.stats.sum.num_bytes = 0;
1282 }
1283 }
1284 }
1285
1286 // Only used in testing so not worried about needing the PG lock here
1287 int64_t get_stats_num_bytes() {
1288 Mutex::Locker l(_lock);
1289     int64_t num_bytes = info.stats.stats.sum.num_bytes;
1290 if (pool.info.is_erasure()) {
1291 num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
1292 // Round up each object by a stripe
1293 num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
1294 }
1295 int64_t lnb = local_num_bytes.load();
1296 if (lnb && lnb != num_bytes) {
1297 lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
1298 << lnb << " vs stats "
1299 << info.stats.stats.sum.num_bytes << " / chunk "
1300 << get_pgbackend()->get_ec_data_chunk_count()
1301 << dendl;
1302 }
1303 return num_bytes;
1304 }
1305
1306 protected:
1307
1308 /*
1309 * blocked request wait hierarchy
1310 *
1311 * In order to preserve request ordering we need to be careful about the
1312 * order in which blocked requests get requeued. Generally speaking, we
1313 * push the requests back up to the op_wq in reverse order (most recent
1314 * request first) so that they come back out again in the original order.
1315 * However, because there are multiple wait queues, we need to requeue
1316 * waitlists in order. Generally speaking, we requeue the wait lists
1317 * that are checked first.
1318 *
1319 * Here are the various wait lists, in the order they are used during
1320 * request processing, with notes:
1321 *
1322 * - waiting_for_map
1323 * - may start or stop blocking at any time (depending on client epoch)
1324 * - waiting_for_peered
1325 * - !is_peered() or flushes_in_progress
1326 * - only starts blocking on interval change; never restarts
1327 * - waiting_for_active
1328 * - !is_active()
1329 * - only starts blocking on interval change; never restarts
1330 * - waiting_for_flush
1331 * - is_active() and flushes_in_progress
1332 * - waiting for final flush during activate
1333 * - waiting_for_scrub
1334 * - starts and stops blocking for varying intervals during scrub
1335 * - waiting_for_unreadable_object
1336 * - never restarts once object is readable (* except for EIO?)
1337 * - waiting_for_degraded_object
1338 * - never restarts once object is writeable (* except for EIO?)
1339 * - waiting_for_blocked_object
1340 * - starts and stops based on proxied op activity
1341 * - obc rwlocks
1342 * - starts and stops based on read/write activity
1343 *
1344 * Notes:
1345 *
1346    * 1. During an interval change, we requeue *everything* in the above order.
1347 *
1348 * 2. When an obc rwlock is released, we check for a scrub block and requeue
1349 * the op there if it applies. We ignore the unreadable/degraded/blocked
1350 * queues because we assume they cannot apply at that time (this is
1351 * probably mostly true).
1352 *
1353 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
1354 * it is non-empty.
1355 *
1356 * These three behaviors are generally sufficient to maintain ordering, with
1357 * the possible exception of cases where we make an object degraded or
1358 * unreadable that was previously okay, e.g. when scrub or op processing
1359 * encounter an unexpected error. FIXME.
1360 */
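// Illustrative sketch of the requeue mechanism described above (loop body is
// schematic): a waitlist is walked back-to-front, so the newest op is pushed to
// the front of the op_wq first and the oldest op ends up at the very front.
//
//   for (auto i = waiting_for_peered.rbegin(); i != waiting_for_peered.rend(); ++i)
//     requeue_op(*i);                     // push to the front of the op_wq
//   waiting_for_peered.clear();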
1361
1362 // pg waiters
1363 unsigned flushes_in_progress;
1364
1365 // ops with newer maps than our (or blocked behind them)
1366 // track these by client, since inter-request ordering doesn't otherwise
1367 // matter.
1368 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
1369
1370 // ops waiting on peered
1371 list<OpRequestRef> waiting_for_peered;
1372
1373 // ops waiting on active (require peered as well)
1374 list<OpRequestRef> waiting_for_active;
1375 list<OpRequestRef> waiting_for_flush;
1376 list<OpRequestRef> waiting_for_scrub;
1377
1378 list<OpRequestRef> waiting_for_cache_not_full;
1379 list<OpRequestRef> waiting_for_clean_to_primary_repair;
1380 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
1381 waiting_for_degraded_object,
1382 waiting_for_blocked_object;
1383
1384 set<hobject_t> objects_blocked_on_cache_full;
1385 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
1386 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
1387
1388 // Callbacks should assume pg (and nothing else) is locked
1389 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
1390
1391 map<eversion_t,
1392 list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;
1393
1394 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
1395 void requeue_op(OpRequestRef op);
1396 void requeue_ops(list<OpRequestRef> &l);
1397
1398 // stats that persist lazily
1399 object_stat_collection_t unstable_stats;
1400
1401 // publish stats
1402 Mutex pg_stats_publish_lock;
1403 bool pg_stats_publish_valid;
1404 pg_stat_t pg_stats_publish;
1405
1406 void _update_calc_stats();
1407 void _update_blocked_by();
1408 friend class TestOpsSocketHook;
1409 void publish_stats_to_osd();
1410 void clear_publish_stats();
1411
1412 void clear_primary_state();
1413
1414 bool is_acting_recovery_backfill(pg_shard_t osd) const {
1415 return acting_recovery_backfill.count(osd);
1416 }
1417 bool is_acting(pg_shard_t osd) const {
1418 return has_shard(pool.info.is_erasure(), acting, osd);
1419 }
1420 bool is_up(pg_shard_t osd) const {
1421 return has_shard(pool.info.is_erasure(), up, osd);
1422 }
1423 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
1424 if (ec) {
1425 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
1426 } else {
1427 return std::find(v.begin(), v.end(), osd.osd) != v.end();
1428 }
1429 }
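  // For example: with an erasure-coded acting set [3,1,5], pg_shard_t(1, shard_id_t(1))
  // is acting because acting[1] == 1; for a replicated pg the shard is NO_SHARD and
  // we simply search the vector for the osd id.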
1430
1431 bool needs_recovery() const;
1432 bool needs_backfill() const;
1433
1434 /// clip calculated priority to reasonable range
1435 int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
1436 /// get log recovery reservation priority
1437 unsigned get_recovery_priority();
1438 /// get backfill reservation priority
1439 unsigned get_backfill_priority();
1440 /// get priority for pg deletion
1441 unsigned get_delete_priority();
1442
1443 void try_mark_clean(); ///< mark an active pg clean
1444
1445 /// return [start,end) bounds for required past_intervals
1446 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
1447 const pg_info_t &info,
1448 epoch_t oldest_map) {
1449 epoch_t start = std::max(
1450 info.history.last_epoch_clean ? info.history.last_epoch_clean :
1451 info.history.epoch_pool_created,
1452 oldest_map);
1453 epoch_t end = std::max(
1454 info.history.same_interval_since,
1455 info.history.epoch_pool_created);
1456 return make_pair(start, end);
1457 }
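  // For example (hypothetical epochs): last_epoch_clean = 880, epoch_pool_created = 200,
  // same_interval_since = 900, oldest_map = 500 gives bounds [880, 900); if the pg has
  // never been clean, the lower bound falls back to epoch_pool_created (but never below
  // oldest_map).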
1458 void check_past_interval_bounds() const;
1459 PastIntervals::PriorSet build_prior();
1460
1461 void remove_down_peer_info(const OSDMapRef osdmap);
1462
1463 bool adjust_need_up_thru(const OSDMapRef osdmap);
1464
1465 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1466 virtual void dump_recovery_info(Formatter *f) const = 0;
1467
1468 void calc_min_last_complete_ondisk() {
1469 eversion_t min = last_complete_ondisk;
1470 ceph_assert(!acting_recovery_backfill.empty());
1471 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1472 i != acting_recovery_backfill.end();
1473 ++i) {
1474 if (*i == get_primary()) continue;
1475 if (peer_last_complete_ondisk.count(*i) == 0)
1476 return; // we don't have complete info
1477 eversion_t a = peer_last_complete_ondisk[*i];
1478 if (a < min)
1479 min = a;
1480 }
1481 if (min == min_last_complete_ondisk)
1482 return;
1483 min_last_complete_ondisk = min;
1484 return;
1485 }
1486
1487 virtual void calc_trim_to() = 0;
1488
1489 virtual void calc_trim_to_aggressive() = 0;
1490
1491 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1492 pg_missing_t& omissing, pg_shard_t from);
1493 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1494 pg_missing_t& omissing, pg_shard_t from);
1495 bool proc_replica_info(
1496 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1497
1498 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1499 PG *pg;
1500 ObjectStore::Transaction *t;
1501 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1502
1503 // LogEntryHandler
1504 void remove(const hobject_t &hoid) override {
1505 pg->get_pgbackend()->remove(hoid, t);
1506 }
1507 void try_stash(const hobject_t &hoid, version_t v) override {
1508 pg->get_pgbackend()->try_stash(hoid, v, t);
1509 }
1510 void rollback(const pg_log_entry_t &entry) override {
1511 ceph_assert(entry.can_rollback());
1512 pg->get_pgbackend()->rollback(entry, t);
1513 }
1514 void rollforward(const pg_log_entry_t &entry) override {
1515 pg->get_pgbackend()->rollforward(entry, t);
1516 }
1517 void trim(const pg_log_entry_t &entry) override {
1518 pg->get_pgbackend()->trim(entry, t);
1519 }
1520 };
1521
1522 void update_object_snap_mapping(
1523 ObjectStore::Transaction *t, const hobject_t &soid,
1524 const set<snapid_t> &snaps);
1525 void clear_object_snap_mapping(
1526 ObjectStore::Transaction *t, const hobject_t &soid);
1527 void remove_snap_mapped_object(
1528 ObjectStore::Transaction& t, const hobject_t& soid);
1529 void merge_log(
1530 ObjectStore::Transaction& t, pg_info_t &oinfo,
1531 pg_log_t &olog, pg_shard_t from);
1532 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1533 bool search_for_missing(
1534 const pg_info_t &oinfo, const pg_missing_t &omissing,
1535 pg_shard_t fromosd,
1536 RecoveryCtx*);
1537
1538 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1539
1540 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1541 const map<pg_shard_t, pg_info_t> &infos,
1542 bool restrict_to_up_acting,
1543 bool *history_les_bound) const;
1544 static void calc_ec_acting(
1545 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1546 unsigned size,
1547 const vector<int> &acting,
1548 const vector<int> &up,
1549 const map<pg_shard_t, pg_info_t> &all_info,
1550 bool restrict_to_up_acting,
1551 vector<int> *want,
1552 set<pg_shard_t> *backfill,
1553 set<pg_shard_t> *acting_backfill,
1554 ostream &ss);
1555 static void calc_replicated_acting(
1556 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1557 uint64_t force_auth_primary_missing_objects,
1558 unsigned size,
1559 const vector<int> &acting,
1560 const vector<int> &up,
1561 pg_shard_t up_primary,
1562 const map<pg_shard_t, pg_info_t> &all_info,
1563 bool restrict_to_up_acting,
1564 vector<int> *want,
1565 set<pg_shard_t> *backfill,
1566 set<pg_shard_t> *acting_backfill,
1567 const OSDMapRef osdmap,
1568 ostream &ss);
1569 void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1570 const pg_info_t &auth_info,
1571 vector<int> *want,
1572 set<pg_shard_t> *async_recovery,
1573 const OSDMapRef osdmap) const;
1574 void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1575 const pg_info_t &auth_info,
1576 vector<int> *want,
1577 set<pg_shard_t> *async_recovery,
1578 const OSDMapRef osdmap) const;
1579
1580 bool recoverable_and_ge_min_size(const vector<int> &want) const;
1581 bool choose_acting(pg_shard_t &auth_log_shard,
1582 bool restrict_to_up_acting,
1583 bool *history_les_bound);
1584 void build_might_have_unfound();
1585 void activate(
1586 ObjectStore::Transaction& t,
1587 epoch_t activation_epoch,
1588 map<int, map<spg_t,pg_query_t> >& query_map,
1589 map<int,
1590 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1591 RecoveryCtx *ctx);
1592
1593 struct C_PG_ActivateCommitted : public Context {
1594 PGRef pg;
1595 epoch_t epoch;
1596 epoch_t activation_epoch;
1597 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1598 : pg(p), epoch(e), activation_epoch(ae) {}
1599 void finish(int r) override {
1600 pg->_activate_committed(epoch, activation_epoch);
1601 }
1602 };
1603 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1604 void all_activated_and_committed();
1605
1606 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1607
1608 bool have_unfound() const {
1609 return missing_loc.have_unfound();
1610 }
1611 uint64_t get_num_unfound() const {
1612 return missing_loc.num_unfound();
1613 }
1614 bool all_missing_unfound() const {
1615 const auto& missing = pg_log.get_missing();
1616 if (!missing.have_missing())
1617 return false;
1618 for (auto& m : missing.get_items()) {
1619 if (!missing_loc.is_unfound(m.first))
1620 return false;
1621 }
1622 return true;
1623 }
1624
1625 virtual void check_local() = 0;
1626
1627 void purge_strays();
1628
1629 void update_heartbeat_peers();
1630
1631 Context *finish_sync_event;
1632
1633 Context *finish_recovery();
1634 void _finish_recovery(Context *c);
1635 struct C_PG_FinishRecovery : public Context {
1636 PGRef pg;
1637 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1638 void finish(int r) override {
1639 pg->_finish_recovery(this);
1640 }
1641 };
1642 void cancel_recovery();
1643 void clear_recovery_state();
1644 virtual void _clear_recovery_state() = 0;
1645 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1646 void start_recovery_op(const hobject_t& soid);
1647 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1648
1649 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1650
1651 friend class C_OSD_RepModify_Commit;
1652 friend class C_DeleteMore;
1653
1654 // -- backoff --
1655 Mutex backoff_lock; // orders inside Backoff::lock
1656 map<hobject_t,set<BackoffRef>> backoffs;
1657
1658 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1659 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1660 void release_backoffs(const hobject_t& o) {
1661 release_backoffs(o, o);
1662 }
1663 void clear_backoffs();
1664
1665 void add_pg_backoff(SessionRef s) {
1666 hobject_t begin = info.pgid.pgid.get_hobj_start();
1667 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1668 add_backoff(s, begin, end);
1669 }
1670 void release_pg_backoffs() {
1671 hobject_t begin = info.pgid.pgid.get_hobj_start();
1672 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1673 release_backoffs(begin, end);
1674 }
1675
1676 // -- scrub --
1677 public:
1678 struct Scrubber {
1679 Scrubber();
1680 ~Scrubber();
1681
1682 // metadata
1683 set<pg_shard_t> reserved_peers;
1684 bool reserved, reserve_failed;
1685 epoch_t epoch_start;
1686
1687 // common to both scrubs
1688 bool active;
1689 set<pg_shard_t> waiting_on_whom;
1690 int shallow_errors;
1691 int deep_errors;
1692 int fixed;
1693 ScrubMap primary_scrubmap;
1694 ScrubMapBuilder primary_scrubmap_pos;
1695 epoch_t replica_scrub_start = 0;
1696 ScrubMap replica_scrubmap;
1697 ScrubMapBuilder replica_scrubmap_pos;
1698 map<pg_shard_t, ScrubMap> received_maps;
1699 OpRequestRef active_rep_scrub;
1700 utime_t scrub_reg_stamp; // stamp we registered for
1701
1702 omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };
1703
1704 // For async sleep
1705 bool sleeping = false;
1706 bool needs_sleep = true;
1707 utime_t sleep_start;
1708
1709 // flags to indicate explicitly requested scrubs (by admin)
1710 bool must_scrub, must_deep_scrub, must_repair;
1711
1712 // Priority to use for scrub scheduling
1713 unsigned priority = 0;
1714
1715 // this flag indicates whether we would like to do auto-repair of the PG or not
1716 bool auto_repair;
1717 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1718 bool check_repair;
1719 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1720 // we should deep scrub in order to auto repair
1721 bool deep_scrub_on_error;
1722
1723 // Maps from objects with errors to missing/inconsistent peers
1724 map<hobject_t, set<pg_shard_t>> missing;
1725 map<hobject_t, set<pg_shard_t>> inconsistent;
1726
1727 // Map from object with errors to good peers
1728 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1729
1730 // Cleaned map pending snap metadata scrub
1731 ScrubMap cleaned_meta_map;
1732
1733 void clean_meta_map(ScrubMap &for_meta_scrub) {
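      // Hand off only objects whose head has definitely been seen: if this chunk
      // ends mid-object (the last entry is a clone whose head may arrive in a later
      // chunk), hold that trailing clone group back in cleaned_meta_map.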
1734 if (end.is_max() ||
1735 cleaned_meta_map.objects.empty()) {
1736 cleaned_meta_map.swap(for_meta_scrub);
1737 } else {
1738 auto iter = cleaned_meta_map.objects.end();
1739 --iter; // not empty, see if clause
1740 auto begin = cleaned_meta_map.objects.begin();
1741 if (iter->first.has_snapset()) {
1742 ++iter;
1743 } else {
1744 while (iter != begin) {
1745 auto next = iter--;
1746 if (next->first.get_head() != iter->first.get_head()) {
1747 ++iter;
1748 break;
1749 }
1750 }
1751 }
1752 for_meta_scrub.objects.insert(begin, iter);
1753 cleaned_meta_map.objects.erase(begin, iter);
1754 }
1755 }
1756
1757 // digest updates which we are waiting on
1758 int num_digest_updates_pending;
1759
1760 // chunky scrub
1761 hobject_t start, end; // [start,end)
1762 hobject_t max_end; // Largest end that may have been sent to replicas
1763 eversion_t subset_last_update;
1764
1765 // chunky scrub state
1766 enum State {
1767 INACTIVE,
1768 NEW_CHUNK,
1769 WAIT_PUSHES,
1770 WAIT_LAST_UPDATE,
1771 BUILD_MAP,
1772 BUILD_MAP_DONE,
1773 WAIT_REPLICAS,
1774 COMPARE_MAPS,
1775 WAIT_DIGEST_UPDATES,
1776 FINISH,
1777 BUILD_MAP_REPLICA,
1778 } state;
1779
1780 std::unique_ptr<Scrub::Store> store;
1781 // deep scrub
1782 bool deep;
1783 int preempt_left;
1784 int preempt_divisor;
1785
1786 list<Context*> callbacks;
1787 void add_callback(Context *context) {
1788 callbacks.push_back(context);
1789 }
1790 void run_callbacks() {
1791 list<Context*> to_run;
1792 to_run.swap(callbacks);
1793 for (list<Context*>::iterator i = to_run.begin();
1794 i != to_run.end();
1795 ++i) {
1796 (*i)->complete(0);
1797 }
1798 }
1799
1800 static const char *state_string(const PG::Scrubber::State& state) {
1801 const char *ret = NULL;
1802 switch( state )
1803 {
1804 case INACTIVE: ret = "INACTIVE"; break;
1805 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1806 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1807 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1808 case BUILD_MAP: ret = "BUILD_MAP"; break;
1809 case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1810 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1811 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1812 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1813 case FINISH: ret = "FINISH"; break;
1814 case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1815 }
1816 return ret;
1817 }
1818
1819 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1820
1821 // clear all state
1822 void reset() {
1823 active = false;
1824 waiting_on_whom.clear();
1825 if (active_rep_scrub) {
1826 active_rep_scrub = OpRequestRef();
1827 }
1828 received_maps.clear();
1829
1830 must_scrub = false;
1831 must_deep_scrub = false;
1832 must_repair = false;
1833 auto_repair = false;
1834 check_repair = false;
1835 deep_scrub_on_error = false;
1836
1837 state = PG::Scrubber::INACTIVE;
1838 start = hobject_t();
1839 end = hobject_t();
1840 max_end = hobject_t();
1841 subset_last_update = eversion_t();
1842 shallow_errors = 0;
1843 deep_errors = 0;
1844 fixed = 0;
1845 omap_stats = (const struct omap_stat_t){ 0 };
1846 deep = false;
1847 run_callbacks();
1848 inconsistent.clear();
1849 missing.clear();
1850 authoritative.clear();
1851 num_digest_updates_pending = 0;
1852 primary_scrubmap = ScrubMap();
1853 primary_scrubmap_pos.reset();
1854 replica_scrubmap = ScrubMap();
1855 replica_scrubmap_pos.reset();
1856 cleaned_meta_map = ScrubMap();
1857 sleeping = false;
1858 needs_sleep = true;
1859 sleep_start = utime_t();
1860 }
1861
1862 void create_results(const hobject_t& obj);
1863 void cleanup_store(ObjectStore::Transaction *t);
1864 } scrubber;
1865
1866 protected:
1867 bool scrub_after_recovery;
1868
1869 int active_pushes;
1870
1871 bool scrub_can_preempt = false;
1872 bool scrub_preempted = false;
1873
1874 // we allow some number of preemptions of the scrub, which means we do
1875 // not block. Then we start to block. Once we start blocking, we do
1876 // not stop until the scrub range is completed.
1877 bool write_blocked_by_scrub(const hobject_t &soid);
1878
1879 /// true if the given range intersects the scrub interval in any way
1880 bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1881
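// Repair soid on bad_peer using the authoritative copies held by ok_peers
// (typically by marking the bad copy as missing so recovery pulls a good one).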
1882 void repair_object(
1883 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1884 pg_shard_t bad_peer);
1885
1886 void chunky_scrub(ThreadPool::TPHandle &handle);
1887 void scrub_compare_maps();
1888 /**
1889 * return true if any inconsistency/missing is repaired, false otherwise
1890 */
1891 bool scrub_process_inconsistent();
1892 bool ops_blocked_by_scrub() const;
1893 void scrub_finish();
1894 void scrub_clear_state(bool keep_repair = false);
1895 void _scan_snaps(ScrubMap &map);
1896 void _repair_oinfo_oid(ScrubMap &map);
1897 void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
1898 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1899 hobject_t start, hobject_t end, bool deep,
1900 bool allow_preemption);
1901 int build_scrub_map_chunk(
1902 ScrubMap &map,
1903 ScrubMapBuilder &pos,
1904 hobject_t start, hobject_t end, bool deep,
1905 ThreadPool::TPHandle &handle);
1906 /**
1907 * returns true if [begin, end) is good to scrub at this time
1908 * a false return value obliges the implementer to requeue scrub when the
1909 * condition preventing scrub clears
1910 */
1911 virtual bool _range_available_for_scrub(
1912 const hobject_t &begin, const hobject_t &end) = 0;
1913 virtual void scrub_snapshot_metadata(
1914 ScrubMap &map,
1915 const std::map<hobject_t,
1916 pair<boost::optional<uint32_t>,
1917 boost::optional<uint32_t>>> &missing_digest) { }
1918 virtual void _scrub_clear_state() { }
1919 virtual void _scrub_finish() { }
1920 void clear_scrub_reserved();
1921 void scrub_reserve_replicas();
1922 void scrub_unreserve_replicas();
1923 bool scrub_all_replicas_reserved() const;
1924
1925 void replica_scrub(
1926 OpRequestRef op,
1927 ThreadPool::TPHandle &handle);
1928 void do_replica_scrub_map(OpRequestRef op);
1929
1930 void handle_scrub_reserve_request(OpRequestRef op);
1931 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1932 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1933 void handle_scrub_reserve_release(OpRequestRef op);
1934
1935 void reject_reservation();
1936 void schedule_backfill_retry(float retry);
1937 void schedule_recovery_retry(float retry);
1938
1939 // -- recovery state --
1940
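// Completion context that, on finish, takes the PG lock and requeues the
// wrapped peering event (stamped with the given epoch) on this PG; used to
// defer an event until some asynchronous work (a transaction commit, a
// timer, etc.) has completed.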
1941 template <class EVT>
1942 struct QueuePeeringEvt : Context {
1943 PGRef pg;
1944 epoch_t epoch;
1945 EVT evt;
1946 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1947 pg(pg), epoch(epoch), evt(evt) {}
1948 void finish(int r) override {
1949 pg->lock();
1950 pg->queue_peering_event(PGPeeringEventRef(
1951 new PGPeeringEvent(
1952 epoch,
1953 epoch,
1954 evt)));
1955 pg->unlock();
1956 }
1957 };
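// Hypothetical usage sketch (the real call sites live in PG.cc): deliver a
// DeleteSome event to the peering machine once a transaction commits.
//
//   t.register_on_commit(
//     new QueuePeeringEvt<DeleteSome>(this, get_osdmap_epoch(), DeleteSome()));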
1958
1959
1960 struct QueryState : boost::statechart::event< QueryState > {
1961 Formatter *f;
1962 explicit QueryState(Formatter *f) : f(f) {}
1963 void print(std::ostream *out) const {
1964 *out << "Query";
1965 }
1966 };
1967
1968 public:
1969 int pg_stat_adjust(osd_stat_t *new_stat);
1970 protected:
1971
1972 struct AdvMap : boost::statechart::event< AdvMap > {
1973 OSDMapRef osdmap;
1974 OSDMapRef lastmap;
1975 vector<int> newup, newacting;
1976 int up_primary, acting_primary;
1977 AdvMap(
1978 OSDMapRef osdmap, OSDMapRef lastmap,
1979 vector<int>& newup, int up_primary,
1980 vector<int>& newacting, int acting_primary):
1981 osdmap(osdmap), lastmap(lastmap),
1982 newup(newup),
1983 newacting(newacting),
1984 up_primary(up_primary),
1985 acting_primary(acting_primary) {}
1986 void print(std::ostream *out) const {
1987 *out << "AdvMap";
1988 }
1989 };
1990
1991 struct ActMap : boost::statechart::event< ActMap > {
1992 ActMap() : boost::statechart::event< ActMap >() {}
1993 void print(std::ostream *out) const {
1994 *out << "ActMap";
1995 }
1996 };
1997 struct Activate : boost::statechart::event< Activate > {
1998 epoch_t activation_epoch;
1999 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
2000 activation_epoch(q) {}
2001 void print(std::ostream *out) const {
2002 *out << "Activate from " << activation_epoch;
2003 }
2004 };
2005 public:
2006 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
2007 explicit UnfoundBackfill() {}
2008 void print(std::ostream *out) const {
2009 *out << "UnfoundBackfill";
2010 }
2011 };
2012 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
2013 explicit UnfoundRecovery() {}
2014 void print(std::ostream *out) const {
2015 *out << "UnfoundRecovery";
2016 }
2017 };
2018
2019 struct RequestScrub : boost::statechart::event<RequestScrub> {
2020 bool deep;
2021 bool repair;
2022 explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
2023 void print(std::ostream *out) const {
2024 *out << "RequestScrub(" << (deep ? "deep" : "shallow")
2025 << (repair ? " repair" : "") << ")";
2026 }
2027 };
2028
2029 protected:
2030 TrivialEvent(Initialize)
2031 TrivialEvent(GotInfo)
2032 TrivialEvent(NeedUpThru)
2033 TrivialEvent(Backfilled)
2034 TrivialEvent(LocalBackfillReserved)
2035 TrivialEvent(RejectRemoteReservation)
2036 public:
2037 TrivialEvent(RequestBackfill)
2038 protected:
2039 TrivialEvent(RemoteRecoveryPreempted)
2040 TrivialEvent(RemoteBackfillPreempted)
2041 TrivialEvent(BackfillTooFull)
2042 TrivialEvent(RecoveryTooFull)
2043
2044 TrivialEvent(MakePrimary)
2045 TrivialEvent(MakeStray)
2046 TrivialEvent(NeedActingChange)
2047 TrivialEvent(IsIncomplete)
2048 TrivialEvent(IsDown)
2049
2050 TrivialEvent(AllReplicasRecovered)
2051 TrivialEvent(DoRecovery)
2052 TrivialEvent(LocalRecoveryReserved)
2053 public:
2054 protected:
2055 TrivialEvent(AllRemotesReserved)
2056 TrivialEvent(AllBackfillsReserved)
2057 TrivialEvent(GoClean)
2058
2059 TrivialEvent(AllReplicasActivated)
2060
2061 TrivialEvent(IntervalFlush)
2062
2063 public:
2064 TrivialEvent(DeleteStart)
2065 TrivialEvent(DeleteSome)
2066
2067 TrivialEvent(SetForceRecovery)
2068 TrivialEvent(UnsetForceRecovery)
2069 TrivialEvent(SetForceBackfill)
2070 TrivialEvent(UnsetForceBackfill)
2071
2072 protected:
2073 TrivialEvent(DeleteReserved)
2074 TrivialEvent(DeleteInterrupted)
2075
2076 /* Encapsulates PG recovery process */
2077 class RecoveryState {
2078 void start_handle(RecoveryCtx *new_ctx);
2079 void end_handle();
2080 public:
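// While an interval flush is in progress, outgoing peering messages are
// buffered in messages_pending_flush instead of being sent:
// begin_block_outgoing() starts buffering, end_block_outgoing() releases the
// buffered messages into the caller's RecoveryCtx, and
// clear_blocked_outgoing() discards them.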
2081 void begin_block_outgoing();
2082 void end_block_outgoing();
2083 void clear_blocked_outgoing();
2084 private:
2085
2086 /* States */
2087 struct Initial;
2088 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
2089 RecoveryState *state;
2090 public:
2091 PG *pg;
2092
2093 utime_t event_time;
2094 uint64_t event_count;
2095
2096 void clear_event_counters() {
2097 event_time = utime_t();
2098 event_count = 0;
2099 }
2100
2101 void log_enter(const char *state_name);
2102 void log_exit(const char *state_name, utime_t duration);
2103
2104 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
2105
2106 /* Accessor functions for state methods */
2107 ObjectStore::Transaction* get_cur_transaction() {
2108 ceph_assert(state->rctx);
2109 ceph_assert(state->rctx->transaction);
2110 return state->rctx->transaction;
2111 }
2112
2113 void send_query(pg_shard_t to, const pg_query_t &query) {
2114 ceph_assert(state->rctx);
2115 ceph_assert(state->rctx->query_map);
2116 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
2117 query;
2118 }
2119
2120 map<int, map<spg_t, pg_query_t> > *get_query_map() {
2121 ceph_assert(state->rctx);
2122 ceph_assert(state->rctx->query_map);
2123 return state->rctx->query_map;
2124 }
2125
2126 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
2127 ceph_assert(state->rctx);
2128 ceph_assert(state->rctx->info_map);
2129 return state->rctx->info_map;
2130 }
2131
2132 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
2133
2134 void send_notify(pg_shard_t to,
2135 const pg_notify_t &info, const PastIntervals &pi) {
2136 ceph_assert(state->rctx);
2137 state->rctx->send_notify(to, info, pi);
2138 }
2139 };
2140 friend class RecoveryMachine;
2141
2142 /* States */
2143 // Initial
2144 // Reset
2145 // Start
2146 // Started
2147 // Primary
2148 // WaitActingChange
2149 // Peering
2150 // GetInfo
2151 // GetLog
2152 // GetMissing
2153 // WaitUpThru
2154 // Incomplete
2155 // Active
2156 // Activating
2157 // Clean
2158 // Recovered
2159 // Backfilling
2160 // WaitRemoteBackfillReserved
2161 // WaitLocalBackfillReserved
2162 // NotBackfilling
2163 // NotRecovering
2164 // Recovering
2165 // WaitRemoteRecoveryReserved
2166 // WaitLocalRecoveryReserved
2167 // ReplicaActive
2168 // RepNotRecovering
2169 // RepRecovering
2170 // RepWaitBackfillReserved
2171 // RepWaitRecoveryReserved
2172 // Stray
2173 // ToDelete
2174 // WaitDeleteReserved
2175 // Deleting
2176 // Crashed
2177
2178 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
2179 explicit Crashed(my_context ctx);
2180 };
2181
2182 struct Reset;
2183
2184 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
2185 explicit Initial(my_context ctx);
2186 void exit();
2187
2188 typedef boost::mpl::list <
2189 boost::statechart::transition< Initialize, Reset >,
2190 boost::statechart::custom_reaction< NullEvt >,
2191 boost::statechart::transition< boost::statechart::event_base, Crashed >
2192 > reactions;
2193
2194 boost::statechart::result react(const MNotifyRec&);
2195 boost::statechart::result react(const MInfoRec&);
2196 boost::statechart::result react(const MLogRec&);
2197 boost::statechart::result react(const boost::statechart::event_base&) {
2198 return discard_event();
2199 }
2200 };
2201
2202 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
2203 explicit Reset(my_context ctx);
2204 void exit();
2205
2206 typedef boost::mpl::list <
2207 boost::statechart::custom_reaction< QueryState >,
2208 boost::statechart::custom_reaction< AdvMap >,
2209 boost::statechart::custom_reaction< ActMap >,
2210 boost::statechart::custom_reaction< NullEvt >,
2211 boost::statechart::custom_reaction< IntervalFlush >,
2212 boost::statechart::transition< boost::statechart::event_base, Crashed >
2213 > reactions;
2214 boost::statechart::result react(const QueryState& q);
2215 boost::statechart::result react(const AdvMap&);
2216 boost::statechart::result react(const ActMap&);
2217 boost::statechart::result react(const IntervalFlush&);
2218 boost::statechart::result react(const boost::statechart::event_base&) {
2219 return discard_event();
2220 }
2221 };
2222
2223 struct Start;
2224
2225 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
2226 explicit Started(my_context ctx);
2227 void exit();
2228
2229 typedef boost::mpl::list <
2230 boost::statechart::custom_reaction< QueryState >,
2231 boost::statechart::custom_reaction< AdvMap >,
2232 boost::statechart::custom_reaction< IntervalFlush >,
2233 // ignored
2234 boost::statechart::custom_reaction< NullEvt >,
2235 boost::statechart::custom_reaction<SetForceRecovery>,
2236 boost::statechart::custom_reaction<UnsetForceRecovery>,
2237 boost::statechart::custom_reaction<SetForceBackfill>,
2238 boost::statechart::custom_reaction<UnsetForceBackfill>,
2239 boost::statechart::custom_reaction<RequestScrub>,
2240 // crash
2241 boost::statechart::transition< boost::statechart::event_base, Crashed >
2242 > reactions;
2243 boost::statechart::result react(const QueryState& q);
2244 boost::statechart::result react(const AdvMap&);
2245 boost::statechart::result react(const IntervalFlush&);
2246 boost::statechart::result react(const boost::statechart::event_base&) {
2247 return discard_event();
2248 }
2249 };
2250
2251 struct Primary;
2252 struct Stray;
2253
2254 struct Start : boost::statechart::state< Start, Started >, NamedState {
2255 explicit Start(my_context ctx);
2256 void exit();
2257
2258 typedef boost::mpl::list <
2259 boost::statechart::transition< MakePrimary, Primary >,
2260 boost::statechart::transition< MakeStray, Stray >
2261 > reactions;
2262 };
2263
2264 struct Peering;
2265 struct WaitActingChange;
2266 struct Incomplete;
2267 struct Down;
2268
2269 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
2270 explicit Primary(my_context ctx);
2271 void exit();
2272
2273 typedef boost::mpl::list <
2274 boost::statechart::custom_reaction< ActMap >,
2275 boost::statechart::custom_reaction< MNotifyRec >,
2276 boost::statechart::transition< NeedActingChange, WaitActingChange >,
2277 boost::statechart::custom_reaction<SetForceRecovery>,
2278 boost::statechart::custom_reaction<UnsetForceRecovery>,
2279 boost::statechart::custom_reaction<SetForceBackfill>,
2280 boost::statechart::custom_reaction<UnsetForceBackfill>,
2281 boost::statechart::custom_reaction<RequestScrub>
2282 > reactions;
2283 boost::statechart::result react(const ActMap&);
2284 boost::statechart::result react(const MNotifyRec&);
2285 boost::statechart::result react(const SetForceRecovery&);
2286 boost::statechart::result react(const UnsetForceRecovery&);
2287 boost::statechart::result react(const SetForceBackfill&);
2288 boost::statechart::result react(const UnsetForceBackfill&);
2289 boost::statechart::result react(const RequestScrub&);
2290 };
2291
2292 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
2293 NamedState {
2294 typedef boost::mpl::list <
2295 boost::statechart::custom_reaction< QueryState >,
2296 boost::statechart::custom_reaction< AdvMap >,
2297 boost::statechart::custom_reaction< MLogRec >,
2298 boost::statechart::custom_reaction< MInfoRec >,
2299 boost::statechart::custom_reaction< MNotifyRec >
2300 > reactions;
2301 explicit WaitActingChange(my_context ctx);
2302 boost::statechart::result react(const QueryState& q);
2303 boost::statechart::result react(const AdvMap&);
2304 boost::statechart::result react(const MLogRec&);
2305 boost::statechart::result react(const MInfoRec&);
2306 boost::statechart::result react(const MNotifyRec&);
2307 void exit();
2308 };
2309
2310 struct GetInfo;
2311 struct Active;
2312
2313 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
2314 PastIntervals::PriorSet prior_set;
2315 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
2316
2317 explicit Peering(my_context ctx);
2318 void exit();
2319
2320 typedef boost::mpl::list <
2321 boost::statechart::custom_reaction< QueryState >,
2322 boost::statechart::transition< Activate, Active >,
2323 boost::statechart::custom_reaction< AdvMap >
2324 > reactions;
2325 boost::statechart::result react(const QueryState& q);
2326 boost::statechart::result react(const AdvMap &advmap);
2327 };
2328
2329 struct WaitLocalRecoveryReserved;
2330 struct Activating;
2331 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
2332 explicit Active(my_context ctx);
2333 void exit();
2334
2335 const set<pg_shard_t> remote_shards_to_reserve_recovery;
2336 const set<pg_shard_t> remote_shards_to_reserve_backfill;
2337 bool all_replicas_activated;
2338
2339 typedef boost::mpl::list <
2340 boost::statechart::custom_reaction< QueryState >,
2341 boost::statechart::custom_reaction< ActMap >,
2342 boost::statechart::custom_reaction< AdvMap >,
2343 boost::statechart::custom_reaction< MInfoRec >,
2344 boost::statechart::custom_reaction< MNotifyRec >,
2345 boost::statechart::custom_reaction< MLogRec >,
2346 boost::statechart::custom_reaction< MTrim >,
2347 boost::statechart::custom_reaction< Backfilled >,
2348 boost::statechart::custom_reaction< AllReplicasActivated >,
2349 boost::statechart::custom_reaction< DeferRecovery >,
2350 boost::statechart::custom_reaction< DeferBackfill >,
2351 boost::statechart::custom_reaction< UnfoundRecovery >,
2352 boost::statechart::custom_reaction< UnfoundBackfill >,
2353 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2354 boost::statechart::custom_reaction< RemoteReservationRevoked>,
2355 boost::statechart::custom_reaction< DoRecovery>
2356 > reactions;
2357 boost::statechart::result react(const QueryState& q);
2358 boost::statechart::result react(const ActMap&);
2359 boost::statechart::result react(const AdvMap&);
2360 boost::statechart::result react(const MInfoRec& infoevt);
2361 boost::statechart::result react(const MNotifyRec& notevt);
2362 boost::statechart::result react(const MLogRec& logevt);
2363 boost::statechart::result react(const MTrim& trimevt);
2364 boost::statechart::result react(const Backfilled&) {
2365 return discard_event();
2366 }
2367 boost::statechart::result react(const AllReplicasActivated&);
2368 boost::statechart::result react(const DeferRecovery& evt) {
2369 return discard_event();
2370 }
2371 boost::statechart::result react(const DeferBackfill& evt) {
2372 return discard_event();
2373 }
2374 boost::statechart::result react(const UnfoundRecovery& evt) {
2375 return discard_event();
2376 }
2377 boost::statechart::result react(const UnfoundBackfill& evt) {
2378 return discard_event();
2379 }
2380 boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
2381 return discard_event();
2382 }
2383 boost::statechart::result react(const RemoteReservationRevoked&) {
2384 return discard_event();
2385 }
2386 boost::statechart::result react(const DoRecovery&) {
2387 return discard_event();
2388 }
2389 };
2390
2391 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
2392 typedef boost::mpl::list<
2393 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2394 boost::statechart::custom_reaction<SetForceRecovery>,
2395 boost::statechart::custom_reaction<SetForceBackfill>
2396 > reactions;
2397 explicit Clean(my_context ctx);
2398 void exit();
2399 boost::statechart::result react(const boost::statechart::event_base&) {
2400 return discard_event();
2401 }
2402 };
2403
2404 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
2405 typedef boost::mpl::list<
2406 boost::statechart::transition< GoClean, Clean >,
2407 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2408 boost::statechart::custom_reaction< AllReplicasActivated >
2409 > reactions;
2410 explicit Recovered(my_context ctx);
2411 void exit();
2412 boost::statechart::result react(const AllReplicasActivated&) {
2413 post_event(GoClean());
2414 return forward_event();
2415 }
2416 };
2417
2418 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2419 typedef boost::mpl::list<
2420 boost::statechart::custom_reaction< Backfilled >,
2421 boost::statechart::custom_reaction< DeferBackfill >,
2422 boost::statechart::custom_reaction< UnfoundBackfill >,
2423 boost::statechart::custom_reaction< RemoteReservationRejected >,
2424 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2425 boost::statechart::custom_reaction< RemoteReservationRevoked>
2426 > reactions;
2427 explicit Backfilling(my_context ctx);
2428 boost::statechart::result react(const RemoteReservationRejected& evt) {
2429 // for compat with old peers
2430 post_event(RemoteReservationRevokedTooFull());
2431 return discard_event();
2432 }
2433 void backfill_release_reservations();
2434 boost::statechart::result react(const Backfilled& evt);
2435 boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
2436 boost::statechart::result react(const RemoteReservationRevoked& evt);
2437 boost::statechart::result react(const DeferBackfill& evt);
2438 boost::statechart::result react(const UnfoundBackfill& evt);
2439 void cancel_backfill();
2440 void exit();
2441 };
2442
2443 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2444 typedef boost::mpl::list<
2445 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2446 boost::statechart::custom_reaction< RemoteReservationRejected >,
2447 boost::statechart::custom_reaction< RemoteReservationRevoked >,
2448 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2449 > reactions;
2450 set<pg_shard_t>::const_iterator backfill_osd_it;
2451 explicit WaitRemoteBackfillReserved(my_context ctx);
2452 void retry();
2453 void exit();
2454 boost::statechart::result react(const RemoteBackfillReserved& evt);
2455 boost::statechart::result react(const RemoteReservationRejected& evt);
2456 boost::statechart::result react(const RemoteReservationRevoked& evt);
2457 };
2458
2459 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2460 typedef boost::mpl::list<
2461 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2462 > reactions;
2463 explicit WaitLocalBackfillReserved(my_context ctx);
2464 void exit();
2465 };
2466
2467 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2468 typedef boost::mpl::list<
2469 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2470 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2471 boost::statechart::custom_reaction< RemoteReservationRejected >
2472 > reactions;
2473 explicit NotBackfilling(my_context ctx);
2474 void exit();
2475 boost::statechart::result react(const RemoteBackfillReserved& evt);
2476 boost::statechart::result react(const RemoteReservationRejected& evt);
2477 };
2478
2479 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2480 typedef boost::mpl::list<
2481 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2482 boost::statechart::custom_reaction< DeferRecovery >,
2483 boost::statechart::custom_reaction< UnfoundRecovery >
2484 > reactions;
2485 explicit NotRecovering(my_context ctx);
2486 boost::statechart::result react(const DeferRecovery& evt) {
2487 /* no-op */
2488 return discard_event();
2489 }
2490 boost::statechart::result react(const UnfoundRecovery& evt) {
2491 /* no-op */
2492 return discard_event();
2493 }
2494 void exit();
2495 };
2496
2497 struct ToDelete;
2498 struct RepNotRecovering;
2499 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2500 explicit ReplicaActive(my_context ctx);
2501 void exit();
2502
2503 typedef boost::mpl::list <
2504 boost::statechart::custom_reaction< QueryState >,
2505 boost::statechart::custom_reaction< ActMap >,
2506 boost::statechart::custom_reaction< MQuery >,
2507 boost::statechart::custom_reaction< MInfoRec >,
2508 boost::statechart::custom_reaction< MLogRec >,
2509 boost::statechart::custom_reaction< MTrim >,
2510 boost::statechart::custom_reaction< Activate >,
2511 boost::statechart::custom_reaction< DeferRecovery >,
2512 boost::statechart::custom_reaction< DeferBackfill >,
2513 boost::statechart::custom_reaction< UnfoundRecovery >,
2514 boost::statechart::custom_reaction< UnfoundBackfill >,
2515 boost::statechart::custom_reaction< RemoteBackfillPreempted >,
2516 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2517 boost::statechart::custom_reaction< RecoveryDone >,
2518 boost::statechart::transition<DeleteStart, ToDelete>
2519 > reactions;
2520 boost::statechart::result react(const QueryState& q);
2521 boost::statechart::result react(const MInfoRec& infoevt);
2522 boost::statechart::result react(const MLogRec& logevt);
2523 boost::statechart::result react(const MTrim& trimevt);
2524 boost::statechart::result react(const ActMap&);
2525 boost::statechart::result react(const MQuery&);
2526 boost::statechart::result react(const Activate&);
2527 boost::statechart::result react(const RecoveryDone&) {
2528 return discard_event();
2529 }
2530 boost::statechart::result react(const DeferRecovery& evt) {
2531 return discard_event();
2532 }
2533 boost::statechart::result react(const DeferBackfill& evt) {
2534 return discard_event();
2535 }
2536 boost::statechart::result react(const UnfoundRecovery& evt) {
2537 return discard_event();
2538 }
2539 boost::statechart::result react(const UnfoundBackfill& evt) {
2540 return discard_event();
2541 }
2542 boost::statechart::result react(const RemoteBackfillPreempted& evt) {
2543 return discard_event();
2544 }
2545 boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
2546 return discard_event();
2547 }
2548 };
2549
2550 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2551 typedef boost::mpl::list<
2552 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2553 // for compat with old peers
2554 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2555 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2556 boost::statechart::custom_reaction< BackfillTooFull >,
2557 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2558 boost::statechart::custom_reaction< RemoteBackfillPreempted >
2559 > reactions;
2560 explicit RepRecovering(my_context ctx);
2561 boost::statechart::result react(const RemoteRecoveryPreempted &evt);
2562 boost::statechart::result react(const BackfillTooFull &evt);
2563 boost::statechart::result react(const RemoteBackfillPreempted &evt);
2564 void exit();
2565 };
2566
2567 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2568 typedef boost::mpl::list<
2569 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2570 boost::statechart::custom_reaction< RejectRemoteReservation >,
2571 boost::statechart::custom_reaction< RemoteReservationRejected >,
2572 boost::statechart::custom_reaction< RemoteReservationCanceled >
2573 > reactions;
2574 explicit RepWaitBackfillReserved(my_context ctx);
2575 void exit();
2576 boost::statechart::result react(const RemoteBackfillReserved &evt);
2577 boost::statechart::result react(const RejectRemoteReservation &evt);
2578 boost::statechart::result react(const RemoteReservationRejected &evt);
2579 boost::statechart::result react(const RemoteReservationCanceled &evt);
2580 };
2581
2582 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2583 typedef boost::mpl::list<
2584 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2585 // for compat with old peers
2586 boost::statechart::custom_reaction< RemoteReservationRejected >,
2587 boost::statechart::custom_reaction< RemoteReservationCanceled >
2588 > reactions;
2589 explicit RepWaitRecoveryReserved(my_context ctx);
2590 void exit();
2591 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2592 boost::statechart::result react(const RemoteReservationRejected &evt) {
2593 // for compat with old peers
2594 post_event(RemoteReservationCanceled());
2595 return discard_event();
2596 }
2597 boost::statechart::result react(const RemoteReservationCanceled &evt);
2598 };
2599
2600 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2601 typedef boost::mpl::list<
2602 boost::statechart::custom_reaction< RequestRecoveryPrio >,
2603 boost::statechart::custom_reaction< RequestBackfillPrio >,
2604 boost::statechart::custom_reaction< RejectRemoteReservation >,
2605 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2606 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2607 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2608 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2609 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2610 > reactions;
2611 explicit RepNotRecovering(my_context ctx);
2612 boost::statechart::result react(const RequestRecoveryPrio &evt);
2613 boost::statechart::result react(const RequestBackfillPrio &evt);
2614 boost::statechart::result react(const RemoteBackfillReserved &evt) {
2615 // my reservation completion raced with a RELEASE from primary
2616 return discard_event();
2617 }
2618 boost::statechart::result react(const RemoteRecoveryReserved &evt) {
2619 // my reservation completion raced with a RELEASE from primary
2620 return discard_event();
2621 }
2622 boost::statechart::result react(const RejectRemoteReservation &evt);
2623 void exit();
2624 };
2625
2626 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2627 typedef boost::mpl::list <
2628 boost::statechart::custom_reaction< AllReplicasRecovered >,
2629 boost::statechart::custom_reaction< DeferRecovery >,
2630 boost::statechart::custom_reaction< UnfoundRecovery >,
2631 boost::statechart::custom_reaction< RequestBackfill >
2632 > reactions;
2633 explicit Recovering(my_context ctx);
2634 void exit();
2635 void release_reservations(bool cancel = false);
2636 boost::statechart::result react(const AllReplicasRecovered &evt);
2637 boost::statechart::result react(const DeferRecovery& evt);
2638 boost::statechart::result react(const UnfoundRecovery& evt);
2639 boost::statechart::result react(const RequestBackfill &evt);
2640 };
2641
2642 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2643 typedef boost::mpl::list <
2644 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2645 boost::statechart::transition< AllRemotesReserved, Recovering >
2646 > reactions;
2647 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2648 explicit WaitRemoteRecoveryReserved(my_context ctx);
2649 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2650 void exit();
2651 };
2652
2653 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2654 typedef boost::mpl::list <
2655 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2656 boost::statechart::custom_reaction< RecoveryTooFull >
2657 > reactions;
2658 explicit WaitLocalRecoveryReserved(my_context ctx);
2659 void exit();
2660 boost::statechart::result react(const RecoveryTooFull &evt);
2661 };
2662
2663 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2664 typedef boost::mpl::list <
2665 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2666 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2667 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2668 > reactions;
2669 explicit Activating(my_context ctx);
2670 void exit();
2671 };
2672
2673 struct Stray : boost::statechart::state< Stray, Started >,
2674 NamedState {
2675 explicit Stray(my_context ctx);
2676 void exit();
2677
2678 typedef boost::mpl::list <
2679 boost::statechart::custom_reaction< MQuery >,
2680 boost::statechart::custom_reaction< MLogRec >,
2681 boost::statechart::custom_reaction< MInfoRec >,
2682 boost::statechart::custom_reaction< ActMap >,
2683 boost::statechart::custom_reaction< RecoveryDone >,
2684 boost::statechart::transition<DeleteStart, ToDelete>
2685 > reactions;
2686 boost::statechart::result react(const MQuery& query);
2687 boost::statechart::result react(const MLogRec& logevt);
2688 boost::statechart::result react(const MInfoRec& infoevt);
2689 boost::statechart::result react(const ActMap&);
2690 boost::statechart::result react(const RecoveryDone&) {
2691 return discard_event();
2692 }
2693 };
2694
2695 struct WaitDeleteReserved;
2696 struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
2697 unsigned priority = 0;
2698 typedef boost::mpl::list <
2699 boost::statechart::custom_reaction< ActMap >,
2700 boost::statechart::custom_reaction< DeleteSome >
2701 > reactions;
2702 explicit ToDelete(my_context ctx);
2703 boost::statechart::result react(const ActMap &evt);
2704 boost::statechart::result react(const DeleteSome &evt) {
2705 // happens if we drop out of Deleting due to reprioritization etc.
2706 return discard_event();
2707 }
2708 void exit();
2709 };
2710
2711 struct Deleting;
2712 struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
2713 ToDelete>, NamedState {
2714 typedef boost::mpl::list <
2715 boost::statechart::transition<DeleteReserved, Deleting>
2716 > reactions;
2717 explicit WaitDeleteReserved(my_context ctx);
2718 void exit();
2719 };
2720
2721 struct Deleting : boost::statechart::state<Deleting,
2722 ToDelete>, NamedState {
2723 typedef boost::mpl::list <
2724 boost::statechart::custom_reaction< DeleteSome >,
2725 boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
2726 > reactions;
2727 explicit Deleting(my_context ctx);
2728 boost::statechart::result react(const DeleteSome &evt);
2729 void exit();
2730 };
2731
2732 struct GetLog;
2733
2734 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2735 set<pg_shard_t> peer_info_requested;
2736
2737 explicit GetInfo(my_context ctx);
2738 void exit();
2739 void get_infos();
2740
2741 typedef boost::mpl::list <
2742 boost::statechart::custom_reaction< QueryState >,
2743 boost::statechart::transition< GotInfo, GetLog >,
2744 boost::statechart::custom_reaction< MNotifyRec >,
2745 boost::statechart::transition< IsDown, Down >
2746 > reactions;
2747 boost::statechart::result react(const QueryState& q);
2748 boost::statechart::result react(const MNotifyRec& infoevt);
2749 };
2750
2751 struct GotLog : boost::statechart::event< GotLog > {
2752 GotLog() : boost::statechart::event< GotLog >() {}
2753 };
2754
2755 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2756 pg_shard_t auth_log_shard;
2757 boost::intrusive_ptr<MOSDPGLog> msg;
2758
2759 explicit GetLog(my_context ctx);
2760 void exit();
2761
2762 typedef boost::mpl::list <
2763 boost::statechart::custom_reaction< QueryState >,
2764 boost::statechart::custom_reaction< MLogRec >,
2765 boost::statechart::custom_reaction< GotLog >,
2766 boost::statechart::custom_reaction< AdvMap >,
2767 boost::statechart::transition< IsIncomplete, Incomplete >
2768 > reactions;
2769 boost::statechart::result react(const AdvMap&);
2770 boost::statechart::result react(const QueryState& q);
2771 boost::statechart::result react(const MLogRec& logevt);
2772 boost::statechart::result react(const GotLog&);
2773 };
2774
2775 struct WaitUpThru;
2776
2777 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2778 set<pg_shard_t> peer_missing_requested;
2779
2780 explicit GetMissing(my_context ctx);
2781 void exit();
2782
2783 typedef boost::mpl::list <
2784 boost::statechart::custom_reaction< QueryState >,
2785 boost::statechart::custom_reaction< MLogRec >,
2786 boost::statechart::transition< NeedUpThru, WaitUpThru >
2787 > reactions;
2788 boost::statechart::result react(const QueryState& q);
2789 boost::statechart::result react(const MLogRec& logevt);
2790 };
2791
2792 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2793 explicit WaitUpThru(my_context ctx);
2794 void exit();
2795
2796 typedef boost::mpl::list <
2797 boost::statechart::custom_reaction< QueryState >,
2798 boost::statechart::custom_reaction< ActMap >,
2799 boost::statechart::custom_reaction< MLogRec >
2800 > reactions;
2801 boost::statechart::result react(const QueryState& q);
2802 boost::statechart::result react(const ActMap& am);
2803 boost::statechart::result react(const MLogRec& logrec);
2804 };
2805
2806 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2807 explicit Down(my_context ctx);
2808 typedef boost::mpl::list <
2809 boost::statechart::custom_reaction< QueryState >,
2810 boost::statechart::custom_reaction< MNotifyRec >
2811 > reactions;
2812 boost::statechart::result react(const QueryState& q);
2813 boost::statechart::result react(const MNotifyRec& infoevt);
2814 void exit();
2815 };
2816
2817 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2818 typedef boost::mpl::list <
2819 boost::statechart::custom_reaction< AdvMap >,
2820 boost::statechart::custom_reaction< MNotifyRec >,
2821 boost::statechart::custom_reaction< QueryState >
2822 > reactions;
2823 explicit Incomplete(my_context ctx);
2824 boost::statechart::result react(const AdvMap &advmap);
2825 boost::statechart::result react(const MNotifyRec& infoevt);
2826 boost::statechart::result react(const QueryState& q);
2827 void exit();
2828 };
2829
2830 RecoveryMachine machine;
2831 PG *pg;
2832
2833 /// context passed in by state machine caller
2834 RecoveryCtx *orig_ctx;
2835
2836 /// populated if we are buffering messages pending a flush
2837 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2838
2839 /**
2840 * Populated between start_handle() and end_handle(); points into the
2841 * message lists of messages_pending_flush while outgoing messages are
2842 * being blocked, and into orig_ctx otherwise.
2843 */
2844 boost::optional<RecoveryCtx> rctx;
2845
2846 public:
2847 explicit RecoveryState(PG *pg)
2848 : machine(this, pg), pg(pg), orig_ctx(0) {
2849 machine.initiate();
2850 }
2851
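// Both overloads bracket machine.process_event() with start_handle() /
// end_handle() so that state reactions can reach the caller's RecoveryCtx
// through rctx while the event is being processed.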
2852 void handle_event(const boost::statechart::event_base &evt,
2853 RecoveryCtx *rctx) {
2854 start_handle(rctx);
2855 machine.process_event(evt);
2856 end_handle();
2857 }
2858
2859 void handle_event(PGPeeringEventRef evt,
2860 RecoveryCtx *rctx) {
2861 start_handle(rctx);
2862 machine.process_event(evt->get_event());
2863 end_handle();
2864 }
2865
2866 } recovery_state;
2867
2868
2869
2870 uint64_t peer_features;
2871 uint64_t acting_features;
2872 uint64_t upacting_features;
2873
2874 epoch_t last_epoch;
2875
2876 /// most recently consumed osdmap's require_osd_release
2877 unsigned last_require_osd_release = 0;
2878 bool delete_needs_sleep = false;
2879
2880 protected:
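// peer_features starts as the full locally supported feature set and is
// narrowed (bitwise AND) as features are learned from each peer, so
// get_min_peer_features() yields the features common to every known peer.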
2881 void reset_min_peer_features() {
2882 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2883 }
2884 uint64_t get_min_peer_features() const { return peer_features; }
2885 void apply_peer_features(uint64_t f) { peer_features &= f; }
2886
2887 uint64_t get_min_acting_features() const { return acting_features; }
2888 uint64_t get_min_upacting_features() const { return upacting_features; }
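// Without the RECOVERY_DELETES osdmap flag (older clusters), deletes cannot
// be deferred to the missing set and must be performed while peering.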
2889 bool perform_deletes_during_peering() const {
2890 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2891 }
2892
2893 bool hard_limit_pglog() const {
2894 return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
2895 }
2896
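// Rebuild the acting/up shard sets and the primary designations from the raw
// OSD id vectors supplied by the OSDMap: for erasure-coded pools the shard id
// is the position within the vector, while replicated pools use NO_SHARD and
// take the primaries directly from the supplied ids.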
2897 void init_primary_up_acting(
2898 const vector<int> &newup,
2899 const vector<int> &newacting,
2900 int new_up_primary,
2901 int new_acting_primary) {
2902 actingset.clear();
2903 acting = newacting;
2904 for (uint8_t i = 0; i < acting.size(); ++i) {
2905 if (acting[i] != CRUSH_ITEM_NONE)
2906 actingset.insert(
2907 pg_shard_t(
2908 acting[i],
2909 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2910 }
2911 upset.clear();
2912 up = newup;
2913 for (uint8_t i = 0; i < up.size(); ++i) {
2914 if (up[i] != CRUSH_ITEM_NONE)
2915 upset.insert(
2916 pg_shard_t(
2917 up[i],
2918 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2919 }
2920 if (!pool.info.is_erasure()) {
2921 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2922 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2923 return;
2924 }
2925 up_primary = pg_shard_t();
2926 primary = pg_shard_t();
2927 for (uint8_t i = 0; i < up.size(); ++i) {
2928 if (up[i] == new_up_primary) {
2929 up_primary = pg_shard_t(up[i], shard_id_t(i));
2930 break;
2931 }
2932 }
2933 for (uint8_t i = 0; i < acting.size(); ++i) {
2934 if (acting[i] == new_acting_primary) {
2935 primary = pg_shard_t(acting[i], shard_id_t(i));
2936 break;
2937 }
2938 }
2939 ceph_assert(up_primary.osd == new_up_primary);
2940 ceph_assert(primary.osd == new_acting_primary);
2941 }
2942
2943 void set_role(int r) {
2944 role = r;
2945 }
2946
2947 bool state_test(uint64_t m) const { return (state & m) != 0; }
2948 void state_set(uint64_t m) { state |= m; }
2949 void state_clear(uint64_t m) { state &= ~m; }
2950
2951 bool is_complete() const { return info.last_complete == info.last_update; }
2952 bool should_send_notify() const { return send_notify; }
2953
2954 uint64_t get_state() const { return state; }
2955 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2956 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2957 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2958 bool is_down() const { return state_test(PG_STATE_DOWN); }
2959 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2960 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2961 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2962 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2963 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2964 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2965 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2966 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2967 bool is_peered() const {
2968 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2969 }
2970 bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
2971 bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
2972 bool is_repair() const { return state_test(PG_STATE_REPAIR); }
2973
2974 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2975
2976 // pg on-disk state
2977 void do_pending_flush();
2978
2979 public:
2980 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2981 static void _init(ObjectStore::Transaction& t,
2982 spg_t pgid, const pg_pool_t *pool);
2983
2984 protected:
2985 void prepare_write_info(map<string,bufferlist> *km);
2986
2987 void update_store_with_options();
2988
2989 public:
2990 static int _prepare_write_info(
2991 CephContext* cct,
2992 map<string,bufferlist> *km,
2993 epoch_t epoch,
2994 pg_info_t &info,
2995 pg_info_t &last_written_info,
2996 PastIntervals &past_intervals,
2997 bool dirty_big_info,
2998 bool dirty_epoch,
2999 bool try_fast_info,
3000 PerfCounters *logger = nullptr);
3001
3002 void write_if_dirty(RecoveryCtx *rctx) {
3003 write_if_dirty(*rctx->transaction);
3004 }
3005 protected:
3006 void write_if_dirty(ObjectStore::Transaction& t);
3007
3008 PGLog::IndexedLog projected_log;
3009 bool check_in_progress_op(
3010 const osd_reqid_t &r,
3011 eversion_t *version,
3012 version_t *user_version,
3013 int *return_code) const;
3014 eversion_t projected_last_update;
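// Version to assign to the next new log entry: the current osdmap epoch
// paired with one past the most recently projected (possibly not yet
// applied) update, which must therefore be ahead of info.last_update and
// the log head.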
3015 eversion_t get_next_version() const {
3016 eversion_t at_version(
3017 get_osdmap_epoch(),
3018 projected_last_update.version+1);
3019 ceph_assert(at_version > info.last_update);
3020 ceph_assert(at_version > pg_log.get_head());
3021 ceph_assert(at_version > projected_last_update);
3022 return at_version;
3023 }
3024
3025 void add_log_entry(const pg_log_entry_t& e, bool applied);
3026 void append_log(
3027 const vector<pg_log_entry_t>& logv,
3028 eversion_t trim_to,
3029 eversion_t roll_forward_to,
3030 ObjectStore::Transaction &t,
3031 bool transaction_applied = true,
3032 bool async = false);
3033 bool check_log_for_corruption(ObjectStore *store);
3034
3035 std::string get_corrupt_pg_log_name() const;
3036
3037 void update_snap_map(
3038 const vector<pg_log_entry_t> &log_entries,
3039 ObjectStore::Transaction& t);
3040
3041 void filter_snapc(vector<snapid_t> &snaps);
3042
3043 void log_weirdness();
3044
3045 virtual void kick_snap_trim() = 0;
3046 virtual void snap_trimmer_scrub_complete() = 0;
3047 bool requeue_scrub(bool high_priority = false);
3048 void queue_recovery();
3049 bool queue_scrub();
3050 unsigned get_scrub_priority();
3051
3052 /// share pg info after a pg is active
3053 void share_pg_info();
3054
3055
3056 bool append_log_entries_update_missing(
3057 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3058 ObjectStore::Transaction &t,
3059 boost::optional<eversion_t> trim_to,
3060 boost::optional<eversion_t> roll_forward_to);
3061
3062 /**
3063 * Merge entries updating missing as necessary on all
3064 * acting_recovery_backfill logs and missings (also missing_loc)
3065 */
3066 void merge_new_log_entries(
3067 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3068 ObjectStore::Transaction &t,
3069 boost::optional<eversion_t> trim_to,
3070 boost::optional<eversion_t> roll_forward_to);
3071
3072 void reset_interval_flush();
3073 void start_peering_interval(
3074 const OSDMapRef lastmap,
3075 const vector<int>& newup, int up_primary,
3076 const vector<int>& newacting, int acting_primary,
3077 ObjectStore::Transaction *t);
3078 void on_new_interval();
3079 virtual void _on_new_interval() = 0;
3080 void start_flush(ObjectStore::Transaction *t);
3081 void set_last_peering_reset();
3082
3083 void update_history(const pg_history_t& history);
3084 void fulfill_info(pg_shard_t from, const pg_query_t &query,
3085 pair<pg_shard_t, pg_info_t> &notify_info);
3086 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
3087 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
3088 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
3089
3090 bool should_restart_peering(
3091 int newupprimary,
3092 int newactingprimary,
3093 const vector<int>& newup,
3094 const vector<int>& newacting,
3095 OSDMapRef lastmap,
3096 OSDMapRef osdmap);
3097
3098 // OpRequest queueing
3099 bool can_discard_op(OpRequestRef& op);
3100 bool can_discard_scan(OpRequestRef op);
3101 bool can_discard_backfill(OpRequestRef op);
3102 bool can_discard_request(OpRequestRef& op);
3103
3104 template<typename T, int MSGTYPE>
3105 bool can_discard_replica_op(OpRequestRef& op);
3106
3107 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
3108 bool old_peering_evt(PGPeeringEventRef evt) {
3109 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
3110 }
3111 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
3112 return e <= cur_epoch;
3113 }
3114 bool have_same_or_newer_map(epoch_t e) {
3115 return e <= get_osdmap_epoch();
3116 }
3117
3118 bool op_has_sufficient_caps(OpRequestRef& op);
3119
3120
3121 // recovery bits
3122 void take_waiters();
3123
3124
3125 // abstract bits
3126 friend class FlushState;
3127
3128 virtual void on_role_change() = 0;
3129 virtual void on_pool_change() = 0;
3130 virtual void on_change(ObjectStore::Transaction *t) = 0;
3131 virtual void on_activate() = 0;
3132 virtual void on_flushed() = 0;
3133 virtual void check_blacklisted_watchers() = 0;
3134
3135 friend ostream& operator<<(ostream& out, const PG& pg);
3136 };
3137
3138
3139 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
3140
3141 #endif