1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include <boost/container/flat_set.hpp>
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/ceph_assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46 #include "PGPeeringEvent.h"
47
48 #include "mgr/OSDPerfMetricTypes.h"
49
50 #include <atomic>
51 #include <list>
52 #include <memory>
53 #include <stack>
54 #include <string>
55 #include <tuple>
56
57 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
58 //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
59
60 class OSD;
61 class OSDService;
62 class OSDShard;
63 class OSDShardPGSlot;
64 class MOSDOp;
65 class MOSDPGScan;
66 class MOSDPGBackfill;
67 class MOSDPGInfo;
68
69 class PG;
70 struct OpRequest;
71 typedef OpRequest::Ref OpRequestRef;
72 class MOSDPGLog;
73 class CephContext;
74 class DynamicPerfStats;
75
76 namespace Scrub {
77 class Store;
78 }
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
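  /*
   * Illustrative sketch (not part of the interface; state names and
   * timestamps are hypothetical): enter_state()/exit_state() use the
   * embedded_states stack so that nested states unwind into
   * state_history entries in exit order.
   *
   *   PGStateInstance psi;
   *   psi.setepoch(42);
   *   psi.enter_state(t0, "Started");           // stack: [Started]
   *   psi.enter_state(t1, "Started/Primary");   // stack: [Started, Started/Primary]
   *   psi.exit_state(t2);   // records {t1, t2, "Started/Primary"}
   *   psi.exit_state(t3);   // records {t0, t3, "Started"}
   */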
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 string get_current_state() {
125 if (pi == nullptr) return "unknown";
126 return std::get<1>(pi->embedded_states.top());
127 }
128
129 private:
130 bool pg_in_destructor = false;
131 PG* thispg = nullptr;
132 std::unique_ptr<PGStateInstance> tmppi;
133 PGStateInstance* pi = nullptr;
134 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
135
136 };
137
138 #ifdef PG_DEBUG_REFS
139 #include "common/tracked_int_ptr.hpp"
140 uint64_t get_with_id(PG *pg);
141 void put_with_id(PG *pg, uint64_t id);
142 typedef TrackedIntPtr<PG> PGRef;
143 #else
144 typedef boost::intrusive_ptr<PG> PGRef;
145 #endif
146
147 class PGRecoveryStats {
148 struct per_state_info {
149 uint64_t enter, exit; // enter/exit counts
150 uint64_t events;
151 utime_t event_time; // time spent processing events
152 utime_t total_time; // total time in state
153 utime_t min_time, max_time;
154
155 // cppcheck-suppress unreachableCode
156 per_state_info() : enter(0), exit(0), events(0) {}
157 };
158 map<const char *,per_state_info> info;
159 Mutex lock;
160
161 public:
 162   PGRecoveryStats() : lock("PGRecoveryStats::lock") {}
163
164 void reset() {
165 std::lock_guard l(lock);
166 info.clear();
167 }
168 void dump(ostream& out) {
169 std::lock_guard l(lock);
170 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
171 per_state_info& i = p->second;
172 out << i.enter << "\t" << i.exit << "\t"
173 << i.events << "\t" << i.event_time << "\t"
174 << i.total_time << "\t"
175 << i.min_time << "\t" << i.max_time << "\t"
176 << p->first << "\n";
177 }
178 }
179
180 void dump_formatted(Formatter *f) {
181 std::lock_guard l(lock);
182 f->open_array_section("pg_recovery_stats");
183 for (map<const char *,per_state_info>::iterator p = info.begin();
184 p != info.end(); ++p) {
185 per_state_info& i = p->second;
186 f->open_object_section("recovery_state");
187 f->dump_int("enter", i.enter);
188 f->dump_int("exit", i.exit);
189 f->dump_int("events", i.events);
190 f->dump_stream("event_time") << i.event_time;
191 f->dump_stream("total_time") << i.total_time;
192 f->dump_stream("min_time") << i.min_time;
193 f->dump_stream("max_time") << i.max_time;
194 vector<string> states;
195 get_str_vec(p->first, "/", states);
196 f->open_array_section("nested_states");
197 for (vector<string>::iterator st = states.begin();
198 st != states.end(); ++st) {
199 f->dump_string("state", *st);
200 }
201 f->close_section();
202 f->close_section();
203 }
204 f->close_section();
205 }
206
207 void log_enter(const char *s) {
208 std::lock_guard l(lock);
209 info[s].enter++;
210 }
211 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
212 std::lock_guard l(lock);
213 per_state_info &i = info[s];
214 i.exit++;
215 i.total_time += dur;
216 if (dur > i.max_time)
217 i.max_time = dur;
218 if (dur < i.min_time || i.min_time == utime_t())
219 i.min_time = dur;
220 i.events += events;
221 i.event_time += event_dur;
222 }
223 };
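/*
 * Minimal usage sketch (hypothetical caller; the state name and the
 * timings are invented): the peering machinery is expected to call
 * log_enter() when a state is entered and log_exit() with the elapsed
 * time and event counts when it is left.
 *
 *   PGRecoveryStats stats;
 *   stats.log_enter("Started/Primary/Active");
 *   // ... state runs, handling 3 events ...
 *   stats.log_exit("Started/Primary/Active",
 *                  utime_t(2, 0),          // 2s total in the state
 *                  3,                      // events processed
 *                  utime_t(0, 500000000)); // 0.5s spent in event handling
 *   stats.dump(std::cout);                 // one tab-separated row per state
 */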
224
225 struct PGPool {
226 CephContext* cct;
227 epoch_t cached_epoch;
228 int64_t id;
229 string name;
230
231 pg_pool_t info;
232 SnapContext snapc; // the default pool snapc, ready to go.
233
234 // these two sets are for < mimic only
235 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
236 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
237
238 PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
239 const string& name)
240 : cct(cct),
241 cached_epoch(map->get_epoch()),
242 id(i),
243 name(name),
244 info(info) {
245 snapc = info.get_snap_context();
246 if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
247 info.build_removed_snaps(cached_removed_snaps);
248 }
249 }
250
251 void update(CephContext *cct, OSDMapRef map);
252 };
253
254 /** PG - Replica Placement Group
255 *
256 */
257
258 class PG : public DoutPrefixProvider {
259 public:
260 // -- members --
261 const spg_t pg_id;
262 const coll_t coll;
263
264 ObjectStore::CollectionHandle ch;
265
266 struct RecoveryCtx;
267
268 // -- methods --
269 std::ostream& gen_prefix(std::ostream& out) const override;
270 CephContext *get_cct() const override {
271 return cct;
272 }
273 unsigned get_subsys() const override {
274 return ceph_subsys_osd;
275 }
276
277 const OSDMapRef& get_osdmap() const {
278 ceph_assert(is_locked());
279 ceph_assert(osdmap_ref);
280 return osdmap_ref;
281 }
282 epoch_t get_osdmap_epoch() const {
283 return osdmap_ref->get_epoch();
284 }
285
286 void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
287 handle.suspend_tp_timeout();
288 lock();
289 handle.reset_tp_timeout();
290 }
291 void lock(bool no_lockdep = false) const;
292 void unlock() const {
293 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
294 ceph_assert(!dirty_info);
295 ceph_assert(!dirty_big_info);
296 _lock.Unlock();
297 }
298 bool is_locked() const {
299 return _lock.is_locked();
300 }
301
302 const spg_t& get_pgid() const {
303 return pg_id;
304 }
305
306 const PGPool& get_pool() const {
307 return pool;
308 }
309 uint64_t get_last_user_version() const {
310 return info.last_user_version;
311 }
312 const pg_history_t& get_history() const {
313 return info.history;
314 }
315 bool get_need_up_thru() const {
316 return need_up_thru;
317 }
318 epoch_t get_same_interval_since() const {
319 return info.history.same_interval_since;
320 }
321
322 void set_last_scrub_stamp(utime_t t) {
323 info.stats.last_scrub_stamp = t;
324 info.history.last_scrub_stamp = t;
325 }
326
327 void set_last_deep_scrub_stamp(utime_t t) {
328 info.stats.last_deep_scrub_stamp = t;
329 info.history.last_deep_scrub_stamp = t;
330 }
331
332 bool is_deleting() const {
333 return deleting;
334 }
335 bool is_deleted() const {
336 return deleted;
337 }
338 bool is_replica() const {
339 return role > 0;
340 }
341 bool is_primary() const {
342 return pg_whoami == primary;
343 }
344 bool pg_has_reset_since(epoch_t e) {
345 ceph_assert(is_locked());
346 return deleted || e < get_last_peering_reset();
347 }
348
349 bool is_ec_pg() const {
350 return pool.info.is_erasure();
351 }
352 int get_role() const {
353 return role;
354 }
355 const vector<int> get_acting() const {
356 return acting;
357 }
358 int get_acting_primary() const {
359 return primary.osd;
360 }
361 pg_shard_t get_primary() const {
362 return primary;
363 }
364 const vector<int> get_up() const {
365 return up;
366 }
367 int get_up_primary() const {
368 return up_primary.osd;
369 }
370 const PastIntervals& get_past_intervals() const {
371 return past_intervals;
372 }
373
374 /// initialize created PG
375 void init(
376 int role,
377 const vector<int>& up,
378 int up_primary,
379 const vector<int>& acting,
380 int acting_primary,
381 const pg_history_t& history,
382 const PastIntervals& pim,
383 bool backfill,
384 ObjectStore::Transaction *t);
385
386 /// read existing pg state off disk
387 void read_state(ObjectStore *store);
388 static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
389
390 static int get_latest_struct_v() {
391 return latest_struct_v;
392 }
393 static int get_compat_struct_v() {
394 return compat_struct_v;
395 }
396 static int read_info(
397 ObjectStore *store, spg_t pgid, const coll_t &coll,
398 pg_info_t &info, PastIntervals &past_intervals,
399 __u8 &);
400 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
401
402 void rm_backoff(BackoffRef b);
403
404 void update_snap_mapper_bits(uint32_t bits) {
405 snap_mapper.update_bits(bits);
406 }
407 void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
408 virtual void split_colls(
409 spg_t child,
410 int split_bits,
411 int seed,
412 const pg_pool_t *pool,
413 ObjectStore::Transaction *t) = 0;
414 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
415 void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
416 unsigned split_bits,
417 const pg_merge_meta_t& last_pg_merge_meta);
418 void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);
419
420 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
421
422 bool is_scrub_registered();
423 void reg_next_scrub();
424 void unreg_next_scrub();
425
426 void on_info_history_change();
427
428 void scrub_requested(bool deep, bool repair, bool need_auto = false);
429
430 bool is_forced_recovery_or_backfill() const {
431 return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
432 }
433 bool set_force_recovery(bool b);
434 bool set_force_backfill(bool b);
435
436 void queue_peering_event(PGPeeringEventRef evt);
437 void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
438 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
439 void queue_flushed(epoch_t started_at);
440 void handle_advance_map(
441 OSDMapRef osdmap, OSDMapRef lastmap,
442 vector<int>& newup, int up_primary,
443 vector<int>& newacting, int acting_primary,
444 RecoveryCtx *rctx);
445 void handle_activate_map(RecoveryCtx *rctx);
446 void handle_initialize(RecoveryCtx *rctx);
447 void handle_query_state(Formatter *f);
448
449 /**
450 * @param ops_begun returns how many recovery ops the function started
451 * @returns true if any useful work was accomplished; false otherwise
452 */
453 virtual bool start_recovery_ops(
454 uint64_t max,
455 ThreadPool::TPHandle &handle,
456 uint64_t *ops_begun) = 0;
457
458 // more work after the above, but with a RecoveryCtx
459 void find_unfound(epoch_t queued, RecoveryCtx *rctx);
460
461 virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
462
463 void dump_pgstate_history(Formatter *f);
464 void dump_missing(Formatter *f);
465
466 void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
467 void with_heartbeat_peers(std::function<void(int)> f);
468
469 void shutdown();
470 virtual void on_shutdown() = 0;
471
472 bool get_must_scrub() const {
473 return scrubber.must_scrub;
474 }
475 bool sched_scrub();
476
477 virtual void do_request(
478 OpRequestRef& op,
479 ThreadPool::TPHandle &handle
480 ) = 0;
481 virtual void clear_cache() = 0;
482 virtual int get_cache_obj_count() = 0;
483
484 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
485 virtual int do_command(
486 cmdmap_t cmdmap,
487 ostream& ss,
488 bufferlist& idata,
489 bufferlist& odata,
490 ConnectionRef conn,
491 ceph_tid_t tid) = 0;
492
493 virtual bool agent_work(int max) = 0;
494 virtual bool agent_work(int max, int agent_flush_quota) = 0;
495 virtual void agent_stop() = 0;
496 virtual void agent_delay() = 0;
497 virtual void agent_clear() = 0;
498 virtual void agent_choose_mode_restart() = 0;
499
500 virtual void on_removal(ObjectStore::Transaction *t) = 0;
501
502 void _delete_some(ObjectStore::Transaction *t);
503
504 virtual void set_dynamic_perf_stats_queries(
505 const std::list<OSDPerfMetricQuery> &queries) {
506 }
507 virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
508 }
509
510 // reference counting
511 #ifdef PG_DEBUG_REFS
512 uint64_t get_with_id();
513 void put_with_id(uint64_t);
514 void dump_live_ids();
515 #endif
516 void get(const char* tag);
517 void put(const char* tag);
518 int get_num_ref() {
519 return ref;
520 }
521
522 // ctor
523 PG(OSDService *o, OSDMapRef curmap,
524 const PGPool &pool, spg_t p);
525 ~PG() override;
526
527 // prevent copying
528 explicit PG(const PG& rhs) = delete;
529 PG& operator=(const PG& rhs) = delete;
530
531 protected:
532 // -------------
533 // protected
534 OSDService *osd;
535 public:
536 OSDShard *osd_shard = nullptr;
537 OSDShardPGSlot *pg_slot = nullptr;
538 protected:
539 CephContext *cct;
540
541 // osdmap
542 OSDMapRef osdmap_ref;
543
544 PGPool pool;
545
546 // locking and reference counting.
547 // I destroy myself when the reference count hits zero.
548 // lock() should be called before doing anything.
549 // get() should be called on pointer copy (to another thread, etc.).
550 // put() should be called on destruction of some previously copied pointer.
551 // unlock() when done with the current pointer (_most common_).
552 mutable Mutex _lock = {"PG::_lock"};
553
554 std::atomic<unsigned int> ref{0};
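  /*
   * Illustrative sketch of the protocol described above (the worker
   * function and queueing helper are hypothetical): a PGRef copy takes
   * an implicit get(), and the holder must lock before touching state.
   *
   *   void queue_for_worker(PGRef pg);                  // copy => get("intptr")
   *
   *   void do_work(PGRef pg, ThreadPool::TPHandle &handle) {
   *     pg->lock_suspend_timeout(handle);  // takes PG::_lock
   *     // ... mutate state; dirty_info/dirty_big_info must be persisted
   *     //     (or cleared) before unlock() ...
   *     pg->unlock();
   *   }  // PGRef destroyed => put("intptr"); PG may delete itself at ref == 0
   */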
555
556 #ifdef PG_DEBUG_REFS
557 Mutex _ref_id_lock = {"PG::_ref_id_lock"};
558 map<uint64_t, string> _live_ids;
559 map<string, uint64_t> _tag_counts;
560 uint64_t _ref_id = 0;
561
562 friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
563 friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
564 #endif
565
566 private:
567 friend void intrusive_ptr_add_ref(PG *pg) {
568 pg->get("intptr");
569 }
570 friend void intrusive_ptr_release(PG *pg) {
571 pg->put("intptr");
572 }
573
574
575 // =====================
576
577 protected:
578 OSDriver osdriver;
579 SnapMapper snap_mapper;
580 bool eio_errors_to_process = false;
581
582 virtual PGBackend *get_pgbackend() = 0;
583 virtual const PGBackend* get_pgbackend() const = 0;
584
585 protected:
586 /*** PG ****/
587 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
588 IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
589 return get_pgbackend()->get_is_recoverable_predicate();
590 }
591 protected:
592 epoch_t last_persisted_osdmap;
593
594 void requeue_map_waiters();
595
596 void update_osdmap_ref(OSDMapRef newmap) {
597 ceph_assert(_lock.is_locked_by_me());
598 osdmap_ref = std::move(newmap);
599 }
600
601 protected:
602
603
 604   bool deleting;  // true while the PG is being removed or the OSD is shutting down
605 atomic<bool> deleted = {false};
606
607 ZTracer::Endpoint trace_endpoint;
608
609
610 protected:
611 bool dirty_info, dirty_big_info;
612
613 protected:
614 // pg state
615 pg_info_t info; ///< current pg info
616 pg_info_t last_written_info; ///< last written info
617 __u8 info_struct_v = 0;
618 static const __u8 latest_struct_v = 10;
619 // v10 is the new past_intervals encoding
620 // v9 was fastinfo_key addition
621 // v8 was the move to a per-pg pgmeta object
622 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
623 // (first appeared in cuttlefish).
624 static const __u8 compat_struct_v = 10;
625 void upgrade(ObjectStore *store);
626
627 protected:
628 PGLog pg_log;
629 ghobject_t pgmeta_oid;
630
631 // ------------------
632 // MissingLoc
633
634 class MissingLoc {
635 public:
636 // a loc_count indicates how many locations we know in each of
637 // these distinct sets
638 struct loc_count_t {
 639       int up = 0;        ///< number of known locations on "up" OSDs
 640       int other = 0;     ///< number of known locations on other OSDs
641
642 friend bool operator<(const loc_count_t& l,
643 const loc_count_t& r) {
644 return (l.up < r.up ||
645 (l.up == r.up &&
646 (l.other < r.other)));
647 }
648 friend ostream& operator<<(ostream& out, const loc_count_t& l) {
649 ceph_assert(l.up >= 0);
650 ceph_assert(l.other >= 0);
651 return out << "(" << l.up << "+" << l.other << ")";
652 }
653 };
654
655
656 private:
657
658 loc_count_t _get_count(const set<pg_shard_t>& shards) {
659 loc_count_t r;
660 for (auto s : shards) {
661 if (pg->upset.count(s)) {
662 r.up++;
663 } else {
664 r.other++;
665 }
666 }
667 return r;
668 }
669
670 map<hobject_t, pg_missing_item> needs_recovery_map;
671 map<hobject_t, set<pg_shard_t> > missing_loc;
672 set<pg_shard_t> missing_loc_sources;
673
674 // for every entry in missing_loc, we count how many of each type of shard we have,
675 // and maintain totals here. The sum of the values for this map will always equal
676 // missing_loc.size().
677 map < shard_id_t, map<loc_count_t,int> > missing_by_count;
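    /*
     * Worked example of the invariant above (hypothetical objects and
     * shards): with two entries in missing_loc for a replicated pool,
     * one known only on an "up" OSD and one known on an "up" and an
     * "other" OSD, we would hold
     *
     *   missing_by_count[shard_id_t::NO_SHARD] == { (1+0) -> 1, (1+1) -> 1 }
     *
     * and the values (1 + 1) sum to missing_loc.size() == 2.
     */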
678
679 void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) {
680 if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
681 int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
682 // For completely missing shards initialize with empty set<pg_shard_t>
683 for (int i = 0 ; i < num_shards ; ++i) {
684 shard_id_t shard(i);
685 pgsbs[shard];
686 }
687 for (auto pgs: s)
688 pgsbs[pgs.shard].insert(pgs);
689 } else {
690 pgsbs[shard_id_t::NO_SHARD] = s;
691 }
692 }
693
694 void _inc_count(const set<pg_shard_t>& s) {
695 map< shard_id_t, set<pg_shard_t> > pgsbs;
696 pgs_by_shard_id(s, pgsbs);
697 for (auto shard: pgsbs)
698 ++missing_by_count[shard.first][_get_count(shard.second)];
699 }
700 void _dec_count(const set<pg_shard_t>& s) {
701 map< shard_id_t, set<pg_shard_t> > pgsbs;
702 pgs_by_shard_id(s, pgsbs);
703 for (auto shard: pgsbs) {
704 auto p = missing_by_count[shard.first].find(_get_count(shard.second));
705 ceph_assert(p != missing_by_count[shard.first].end());
706 if (--p->second == 0) {
707 missing_by_count[shard.first].erase(p);
708 }
709 }
710 }
711
712 PG *pg;
713 set<pg_shard_t> empty_set;
714 public:
715 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
716 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
717 explicit MissingLoc(PG *pg)
718 : pg(pg) { }
719 void set_backend_predicates(
720 IsPGReadablePredicate *_is_readable,
721 IsPGRecoverablePredicate *_is_recoverable) {
722 is_readable.reset(_is_readable);
723 is_recoverable.reset(_is_recoverable);
724 }
725 std::ostream& gen_prefix(std::ostream& out) const {
726 return pg->gen_prefix(out);
727 }
728 bool needs_recovery(
729 const hobject_t &hoid,
730 eversion_t *v = 0) const {
731 map<hobject_t, pg_missing_item>::const_iterator i =
732 needs_recovery_map.find(hoid);
733 if (i == needs_recovery_map.end())
734 return false;
735 if (v)
736 *v = i->second.need;
737 return true;
738 }
739 bool is_deleted(const hobject_t &hoid) const {
740 auto i = needs_recovery_map.find(hoid);
741 if (i == needs_recovery_map.end())
742 return false;
743 return i->second.is_delete();
744 }
745 bool is_unfound(const hobject_t &hoid) const {
746 auto it = needs_recovery_map.find(hoid);
747 if (it == needs_recovery_map.end()) {
748 return false;
749 }
750 if (it->second.is_delete()) {
751 return false;
752 }
753 auto mit = missing_loc.find(hoid);
754 return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
755 }
756 bool readable_with_acting(
757 const hobject_t &hoid,
758 const set<pg_shard_t> &acting) const;
759 uint64_t num_unfound() const {
760 uint64_t ret = 0;
761 for (map<hobject_t, pg_missing_item>::const_iterator i =
762 needs_recovery_map.begin();
763 i != needs_recovery_map.end();
764 ++i) {
765 if (i->second.is_delete())
766 continue;
767 auto mi = missing_loc.find(i->first);
768 if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
769 ++ret;
770 }
771 return ret;
772 }
773
774 bool have_unfound() const {
775 for (map<hobject_t, pg_missing_item>::const_iterator i =
776 needs_recovery_map.begin();
777 i != needs_recovery_map.end();
778 ++i) {
779 if (i->second.is_delete())
780 continue;
781 auto mi = missing_loc.find(i->first);
782 if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
783 return true;
784 }
785 return false;
786 }
787 void clear() {
788 needs_recovery_map.clear();
789 missing_loc.clear();
790 missing_loc_sources.clear();
791 missing_by_count.clear();
792 }
793
794 void add_location(const hobject_t &hoid, pg_shard_t location) {
795 auto p = missing_loc.find(hoid);
796 if (p == missing_loc.end()) {
797 p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
798 } else {
799 _dec_count(p->second);
800 }
801 p->second.insert(location);
802 _inc_count(p->second);
803 }
804 void remove_location(const hobject_t &hoid, pg_shard_t location) {
805 auto p = missing_loc.find(hoid);
806 if (p != missing_loc.end()) {
807 _dec_count(p->second);
808 p->second.erase(location);
809 if (p->second.empty()) {
810 missing_loc.erase(p);
811 } else {
812 _inc_count(p->second);
813 }
814 }
815 }
816
817 void clear_location(const hobject_t &hoid) {
818 auto p = missing_loc.find(hoid);
819 if (p != missing_loc.end()) {
820 _dec_count(p->second);
821 missing_loc.erase(p);
822 }
823 }
824
825 void add_active_missing(const pg_missing_t &missing) {
826 for (map<hobject_t, pg_missing_item>::const_iterator i =
827 missing.get_items().begin();
828 i != missing.get_items().end();
829 ++i) {
830 map<hobject_t, pg_missing_item>::const_iterator j =
831 needs_recovery_map.find(i->first);
832 if (j == needs_recovery_map.end()) {
833 needs_recovery_map.insert(*i);
834 } else {
835 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
836 << i->first << " have " << j->second
837 << " tried to add " << i->second << dendl;
838 ceph_assert(i->second.need == j->second.need);
839 }
840 }
841 }
842
843 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
844 needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
845 }
846 void revise_need(const hobject_t &hoid, eversion_t need) {
847 auto it = needs_recovery_map.find(hoid);
848 ceph_assert(it != needs_recovery_map.end());
849 it->second.need = need;
850 }
851
852 /// Adds info about a possible recovery source
853 bool add_source_info(
854 pg_shard_t source, ///< [in] source
855 const pg_info_t &oinfo, ///< [in] info
856 const pg_missing_t &omissing, ///< [in] (optional) missing
857 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
858 ); ///< @return whether a new object location was discovered
859
860 /// Adds recovery sources in batch
861 void add_batch_sources_info(
 862       const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
863 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
864 );
865
 866     /// Uses osdmap to update structures for now-down sources
867 void check_recovery_sources(const OSDMapRef& osdmap);
868
869 /// Call when hoid is no longer missing in acting set
870 void recovered(const hobject_t &hoid) {
871 needs_recovery_map.erase(hoid);
872 auto p = missing_loc.find(hoid);
873 if (p != missing_loc.end()) {
874 _dec_count(p->second);
875 missing_loc.erase(p);
876 }
877 }
878
879 /// Call to update structures for hoid after a change
880 void rebuild(
881 const hobject_t &hoid,
882 pg_shard_t self,
883 const set<pg_shard_t> to_recover,
884 const pg_info_t &info,
885 const pg_missing_t &missing,
886 const map<pg_shard_t, pg_missing_t> &pmissing,
887 const map<pg_shard_t, pg_info_t> &pinfo) {
888 recovered(hoid);
889 boost::optional<pg_missing_item> item;
890 auto miter = missing.get_items().find(hoid);
891 if (miter != missing.get_items().end()) {
892 item = miter->second;
893 } else {
894 for (auto &&i: to_recover) {
895 if (i == self)
896 continue;
897 auto pmiter = pmissing.find(i);
898 ceph_assert(pmiter != pmissing.end());
899 miter = pmiter->second.get_items().find(hoid);
900 if (miter != pmiter->second.get_items().end()) {
901 item = miter->second;
902 break;
903 }
904 }
905 }
906 if (!item)
907 return; // recovered!
908
909 needs_recovery_map[hoid] = *item;
910 if (item->is_delete())
911 return;
912 auto mliter =
913 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
914 ceph_assert(info.last_backfill.is_max());
915 ceph_assert(info.last_update >= item->need);
916 if (!missing.is_missing(hoid))
917 mliter->second.insert(self);
918 for (auto &&i: pmissing) {
919 if (i.first == self)
920 continue;
921 auto pinfoiter = pinfo.find(i.first);
922 ceph_assert(pinfoiter != pinfo.end());
923 if (item->need <= pinfoiter->second.last_update &&
924 hoid <= pinfoiter->second.last_backfill &&
925 !i.second.is_missing(hoid))
926 mliter->second.insert(i.first);
927 }
928 _inc_count(mliter->second);
929 }
930
931 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
932 auto it = missing_loc.find(hoid);
933 return it == missing_loc.end() ? empty_set : it->second;
934 }
935 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
936 return missing_loc;
937 }
938 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
939 return needs_recovery_map;
940 }
941 const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
942 return missing_by_count;
943 }
944 } missing_loc;
945
946 PastIntervals past_intervals;
947
948 interval_set<snapid_t> snap_trimq;
949
950 /* You should not use these items without taking their respective queue locks
951 * (if they have one) */
952 xlist<PG*>::item stat_queue_item;
953 bool scrub_queued;
954 bool recovery_queued;
955
956 int recovery_ops_active;
957 set<pg_shard_t> waiting_on_backfill;
958 #ifdef DEBUG_RECOVERY_OIDS
959 multiset<hobject_t> recovering_oids;
960 #endif
961
962 protected:
 963   int role;    // 0 = primary, 1 = replica, -1 = none.
964 uint64_t state; // PG_STATE_*
965
966 bool send_notify; ///< true if we are non-primary and should notify the primary
967
968 protected:
969 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
970 eversion_t last_complete_ondisk; // last_complete that has committed.
971 eversion_t last_update_applied;
972
973 // entries <= last_rollback_info_trimmed_to_applied have been trimmed
974 eversion_t last_rollback_info_trimmed_to_applied;
975
976 // primary state
977 protected:
978 pg_shard_t primary;
979 pg_shard_t pg_whoami;
980 pg_shard_t up_primary;
981 vector<int> up, acting, want_acting;
982 // acting_recovery_backfill contains shards that are acting,
983 // async recovery targets, or backfill targets.
984 set<pg_shard_t> acting_recovery_backfill, actingset, upset;
985 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
986 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
987 eversion_t pg_trim_to;
988
989 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
990
991 protected:
992 // [primary only] content recovery state
993 struct BufferedRecoveryMessages {
994 map<int, map<spg_t, pg_query_t> > query_map;
995 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
996 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
997 };
998
999 public:
1000 bool dne() { return info.dne(); }
1001 struct RecoveryCtx {
1002 utime_t start_time;
1003 map<int, map<spg_t, pg_query_t> > *query_map;
1004 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
1005 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
1006 ObjectStore::Transaction *transaction;
1007 ThreadPool::TPHandle* handle;
1008 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
1009 map<int,
1010 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
1011 map<int,
1012 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
1013 ObjectStore::Transaction *transaction)
1014 : query_map(query_map), info_map(info_map),
1015 notify_list(notify_list),
1016 transaction(transaction),
1017 handle(NULL) {}
1018
1019 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
1020 : query_map(&(buf.query_map)),
1021 info_map(&(buf.info_map)),
1022 notify_list(&(buf.notify_list)),
1023 transaction(rctx.transaction),
1024 handle(rctx.handle) {}
1025
1026 void accept_buffered_messages(BufferedRecoveryMessages &m) {
1027 ceph_assert(query_map);
1028 ceph_assert(info_map);
1029 ceph_assert(notify_list);
1030 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
1031 i != m.query_map.end();
1032 ++i) {
1033 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
1034 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
1035 j != i->second.end();
1036 ++j) {
1037 omap[j->first] = j->second;
1038 }
1039 }
1040 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
1041 = m.info_map.begin();
1042 i != m.info_map.end();
1043 ++i) {
1044 vector<pair<pg_notify_t, PastIntervals> > &ovec =
1045 (*info_map)[i->first];
1046 ovec.reserve(ovec.size() + i->second.size());
1047 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
1048 }
1049 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
1050 = m.notify_list.begin();
1051 i != m.notify_list.end();
1052 ++i) {
1053 vector<pair<pg_notify_t, PastIntervals> > &ovec =
1054 (*notify_list)[i->first];
1055 ovec.reserve(ovec.size() + i->second.size());
1056 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
1057 }
1058 }
1059
1060 void send_notify(pg_shard_t to,
1061 const pg_notify_t &info, const PastIntervals &pi) {
1062 ceph_assert(notify_list);
1063 (*notify_list)[to.osd].push_back(make_pair(info, pi));
1064 }
1065 };
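  /*
   * Illustrative sketch of the buffering constructor (the caller-side
   * locals query_map/info_map/notify_list/t are hypothetical): work
   * that must not emit messages immediately can run against a buffered
   * context and the accumulated messages are folded back afterwards.
   *
   *   RecoveryCtx rctx(&query_map, &info_map, &notify_list, &t);
   *   BufferedRecoveryMessages buf;
   *   RecoveryCtx buffered(buf, rctx);     // same transaction and handle,
   *                                        // messages collect in 'buf'
   *   // ... handle peering events with 'buffered' ...
   *   rctx.accept_buffered_messages(buf);  // merge queries/infos/notifies back
   */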
1066 protected:
1067
1068 PGStateHistory pgstate_history;
1069
1070 struct NamedState {
1071 const char *state_name;
1072 utime_t enter_time;
1073 PG* pg;
1074 const char *get_state_name() { return state_name; }
1075 NamedState(PG *pg_, const char *state_name_)
1076 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
1077 pg->pgstate_history.enter(pg, enter_time, state_name);
1078 }
1079 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
1080 };
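  /*
   * Sketch of NamedState as an RAII guard (the derived state below is
   * hypothetical): concrete peering states derive from NamedState so
   * that construction records entry into the state and destruction
   * records the exit in pgstate_history.
   *
   *   struct Activating : NamedState {
   *     explicit Activating(PG *pg)
   *       : NamedState(pg, "Started/Primary/Active/Activating") {}
   *     // ~Activating() -> ~NamedState() -> pgstate_history.exit(...)
   *   };
   */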
1081
1082
1083
1084 protected:
1085
1086 /*
1087 * peer_info -- projected (updates _before_ replicas ack)
1088 * peer_missing -- committed (updates _after_ replicas ack)
1089 */
1090
1091 bool need_up_thru;
1092 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
1093 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
1094 map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info
1095 set<pg_shard_t> peer_purged; // peers purged
1096 map<pg_shard_t, pg_missing_t> peer_missing;
1097 set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
1098 set<pg_shard_t> peer_missing_requested;
1099
1100 // i deleted these strays; ignore racing PGInfo from them
1101 set<pg_shard_t> peer_activated;
1102
1103 // primary-only, recovery-only state
1104 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
1105 // which are unfound on the primary
1106 epoch_t last_peering_reset;
1107
1108 epoch_t get_last_peering_reset() const {
1109 return last_peering_reset;
1110 }
1111
1112 /* heartbeat peers */
1113 void set_probe_targets(const set<pg_shard_t> &probe_set);
1114 void clear_probe_targets();
1115
1116 Mutex heartbeat_peer_lock;
1117 set<int> heartbeat_peers;
1118 set<int> probe_targets;
1119
1120 public:
1121 /**
1122 * BackfillInterval
1123 *
1124 * Represents the objects in a range [begin, end)
1125 *
1126 * Possible states:
1127    * 1) begin == end == hobject_t() indicates that the interval is unpopulated
1128 * 2) Else, objects contains all objects in [begin, end)
1129 */
1130 struct BackfillInterval {
1131 // info about a backfill interval on a peer
1132 eversion_t version; /// version at which the scan occurred
1133 map<hobject_t,eversion_t> objects;
1134 hobject_t begin;
1135 hobject_t end;
1136
1137 /// clear content
1138 void clear() {
1139 *this = BackfillInterval();
1140 }
1141
1142 /// clear objects list only
1143 void clear_objects() {
1144 objects.clear();
1145 }
1146
1147 /// reinstantiate with a new start+end position and sort order
1148 void reset(hobject_t start) {
1149 clear();
1150 begin = end = start;
1151 }
1152
1153 /// true if there are no objects in this interval
1154 bool empty() const {
1155 return objects.empty();
1156 }
1157
1158 /// true if interval extends to the end of the range
1159 bool extends_to_end() const {
1160 return end.is_max();
1161 }
1162
1163 /// removes items <= soid and adjusts begin to the first object
1164 void trim_to(const hobject_t &soid) {
1165 trim();
1166 while (!objects.empty() &&
1167 objects.begin()->first <= soid) {
1168 pop_front();
1169 }
1170 }
1171
1172 /// Adjusts begin to the first object
1173 void trim() {
1174 if (!objects.empty())
1175 begin = objects.begin()->first;
1176 else
1177 begin = end;
1178 }
1179
1180 /// drop first entry, and adjust @begin accordingly
1181 void pop_front() {
1182 ceph_assert(!objects.empty());
1183 objects.erase(objects.begin());
1184 trim();
1185 }
1186
1187 /// dump
1188 void dump(Formatter *f) const {
1189 f->dump_stream("begin") << begin;
1190 f->dump_stream("end") << end;
1191 f->open_array_section("objects");
1192 for (map<hobject_t, eversion_t>::const_iterator i =
1193 objects.begin();
1194 i != objects.end();
1195 ++i) {
1196 f->open_object_section("object");
1197 f->dump_stream("object") << i->first;
1198 f->dump_stream("version") << i->second;
1199 f->close_section();
1200 }
1201 f->close_section();
1202 }
1203 };
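  /*
   * Worked example (hypothetical objects a < b < c): after a scan the
   * interval might hold objects = {a, b, c} with begin == a.
   *
   *   BackfillInterval bi;   // begin == end == hobject_t(), i.e. unpopulated
   *   // ... scan fills bi.objects, bi.begin, bi.end ...
   *   bi.trim_to(b);         // drops a and b (entries <= b); begin becomes c
   *   bool done = bi.empty() && bi.extends_to_end();
   */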
1204
1205 protected:
1206 BackfillInterval backfill_info;
1207 map<pg_shard_t, BackfillInterval> peer_backfill_info;
1208 bool backfill_reserved;
1209 bool backfill_reserving;
1210
1211 set<pg_shard_t> backfill_targets, async_recovery_targets;
1212
1213 // The primary's num_bytes and local num_bytes for this pg, only valid
1214 // during backfill for non-primary shards.
1215 // Both of these are adjusted for EC to reflect the on-disk bytes
1216 std::atomic<int64_t> primary_num_bytes = 0;
1217 std::atomic<int64_t> local_num_bytes = 0;
1218
1219 public:
1220 bool is_backfill_targets(pg_shard_t osd) {
1221 return backfill_targets.count(osd);
1222 }
1223
1224 // Space reserved for backfill is primary_num_bytes - local_num_bytes
1225 // Don't care that difference itself isn't atomic
1226 uint64_t get_reserved_num_bytes() {
1227 int64_t primary = primary_num_bytes.load();
1228 int64_t local = local_num_bytes.load();
1229 if (primary > local)
1230 return primary - local;
1231 else
1232 return 0;
1233 }
1234
1235 bool is_remote_backfilling() {
1236 return primary_num_bytes.load() > 0;
1237 }
1238
1239 void set_reserved_num_bytes(int64_t primary, int64_t local);
1240 void clear_reserved_num_bytes();
1241
1242   // If num_bytes is inconsistent and local_num_bytes would go negative,
1243   // that's ok, because the value would then be ignored.
1244
1245 // The value of num_bytes could be negative,
1246 // but we don't let local_num_bytes go negative.
1247 void add_local_num_bytes(int64_t num_bytes) {
1248 if (num_bytes) {
1249 int64_t prev_bytes = local_num_bytes.load();
1250 int64_t new_bytes;
1251 do {
1252 new_bytes = prev_bytes + num_bytes;
1253 if (new_bytes < 0)
1254 new_bytes = 0;
1255 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
1256 }
1257 }
1258 void sub_local_num_bytes(int64_t num_bytes) {
1259 ceph_assert(num_bytes >= 0);
1260 if (num_bytes) {
1261 int64_t prev_bytes = local_num_bytes.load();
1262 int64_t new_bytes;
1263 do {
1264 new_bytes = prev_bytes - num_bytes;
1265 if (new_bytes < 0)
1266 new_bytes = 0;
1267 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
1268 }
1269 }
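  // Illustrative note on the compare_exchange_weak loops above: both
  // updates are lock-free and clamp at zero instead of letting the
  // counter go negative, e.g. (values hypothetical):
  //
  //   add_local_num_bytes(4096);   // local_num_bytes: 0 -> 4096
  //   sub_local_num_bytes(8192);   // clamped: 4096 -> 0, not -4096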
1270 // The value of num_bytes could be negative,
1271 // but we don't let info.stats.stats.sum.num_bytes go negative.
1272 void add_num_bytes(int64_t num_bytes) {
1273 ceph_assert(_lock.is_locked_by_me());
1274 if (num_bytes) {
1275 info.stats.stats.sum.num_bytes += num_bytes;
1276 if (info.stats.stats.sum.num_bytes < 0) {
1277 info.stats.stats.sum.num_bytes = 0;
1278 }
1279 }
1280 }
1281 void sub_num_bytes(int64_t num_bytes) {
1282 ceph_assert(_lock.is_locked_by_me());
1283 ceph_assert(num_bytes >= 0);
1284 if (num_bytes) {
1285 info.stats.stats.sum.num_bytes -= num_bytes;
1286 if (info.stats.stats.sum.num_bytes < 0) {
1287 info.stats.stats.sum.num_bytes = 0;
1288 }
1289 }
1290 }
1291
1292 // Only used in testing so not worried about needing the PG lock here
1293 int64_t get_stats_num_bytes() {
1294 Mutex::Locker l(_lock);
1295     int64_t num_bytes = info.stats.stats.sum.num_bytes;
1296 if (pool.info.is_erasure()) {
1297 num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
1298 // Round up each object by a stripe
1299 num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
1300 }
1301 int64_t lnb = local_num_bytes.load();
1302 if (lnb && lnb != num_bytes) {
1303 lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
1304 << lnb << " vs stats "
1305 << info.stats.stats.sum.num_bytes << " / chunk "
1306 << get_pgbackend()->get_ec_data_chunk_count()
1307 << dendl;
1308 }
1309 return num_bytes;
1310 }
1311
1312 protected:
1313
1314 /*
1315 * blocked request wait hierarchy
1316 *
1317 * In order to preserve request ordering we need to be careful about the
1318 * order in which blocked requests get requeued. Generally speaking, we
1319 * push the requests back up to the op_wq in reverse order (most recent
1320 * request first) so that they come back out again in the original order.
1321 * However, because there are multiple wait queues, we need to requeue
1322 * waitlists in order. Generally speaking, we requeue the wait lists
1323 * that are checked first.
1324 *
1325 * Here are the various wait lists, in the order they are used during
1326 * request processing, with notes:
1327 *
1328 * - waiting_for_map
1329 * - may start or stop blocking at any time (depending on client epoch)
1330 * - waiting_for_peered
1331 * - !is_peered() or flushes_in_progress
1332 * - only starts blocking on interval change; never restarts
1333 * - waiting_for_active
1334 * - !is_active()
1335 * - only starts blocking on interval change; never restarts
1336 * - waiting_for_flush
1337 * - is_active() and flushes_in_progress
1338 * - waiting for final flush during activate
1339 * - waiting_for_scrub
1340 * - starts and stops blocking for varying intervals during scrub
1341 * - waiting_for_unreadable_object
1342 * - never restarts once object is readable (* except for EIO?)
1343 * - waiting_for_degraded_object
1344 * - never restarts once object is writeable (* except for EIO?)
1345 * - waiting_for_blocked_object
1346 * - starts and stops based on proxied op activity
1347 * - obc rwlocks
1348 * - starts and stops based on read/write activity
1349 *
1350 * Notes:
1351 *
1352  * 1. During an interval change, we requeue *everything* in the above order.
1353 *
1354 * 2. When an obc rwlock is released, we check for a scrub block and requeue
1355 * the op there if it applies. We ignore the unreadable/degraded/blocked
1356 * queues because we assume they cannot apply at that time (this is
1357 * probably mostly true).
1358 *
1359 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
1360 * it is non-empty.
1361 *
1362 * These three behaviors are generally sufficient to maintain ordering, with
1363 * the possible exception of cases where we make an object degraded or
1364 * unreadable that was previously okay, e.g. when scrub or op processing
1365 * encounter an unexpected error. FIXME.
1366 */
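  /*
   * Sketch of the interval-change requeue implied by note 1 above (the
   * helper calls follow the wait lists in the documented order; this is
   * an illustration, not the literal implementation):
   *
   *   requeue_ops(waiting_for_peered);
   *   requeue_ops(waiting_for_active);
   *   requeue_ops(waiting_for_flush);
   *   requeue_ops(waiting_for_scrub);
   *   requeue_object_waiters(waiting_for_unreadable_object);
   *   requeue_object_waiters(waiting_for_degraded_object);
   *   requeue_object_waiters(waiting_for_blocked_object);
   *   // requeue_ops pushes each list back to the op_wq most-recent-first,
   *   // so the requests pop out again in their original order.
   */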
1367
1368 // pg waiters
1369 unsigned flushes_in_progress;
1370
1371 // ops with newer maps than our (or blocked behind them)
1372 // track these by client, since inter-request ordering doesn't otherwise
1373 // matter.
1374 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
1375
1376 // ops waiting on peered
1377 list<OpRequestRef> waiting_for_peered;
1378
1379 // ops waiting on active (require peered as well)
1380 list<OpRequestRef> waiting_for_active;
1381 list<OpRequestRef> waiting_for_flush;
1382 list<OpRequestRef> waiting_for_scrub;
1383
1384 list<OpRequestRef> waiting_for_cache_not_full;
1385 list<OpRequestRef> waiting_for_clean_to_primary_repair;
1386 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
1387 waiting_for_degraded_object,
1388 waiting_for_blocked_object;
1389
1390 set<hobject_t> objects_blocked_on_cache_full;
1391 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
1392 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
1393
1394 // Callbacks should assume pg (and nothing else) is locked
1395 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
1396
1397 map<eversion_t,
1398 list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;
1399
1400 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
1401 void requeue_op(OpRequestRef op);
1402 void requeue_ops(list<OpRequestRef> &l);
1403
1404 // stats that persist lazily
1405 object_stat_collection_t unstable_stats;
1406
1407 // publish stats
1408 Mutex pg_stats_publish_lock;
1409 bool pg_stats_publish_valid;
1410 pg_stat_t pg_stats_publish;
1411
1412 void _update_calc_stats();
1413 void _update_blocked_by();
1414 friend class TestOpsSocketHook;
1415 void publish_stats_to_osd();
1416 void clear_publish_stats();
1417
1418 void clear_primary_state();
1419
1420 bool is_acting_recovery_backfill(pg_shard_t osd) const {
1421 return acting_recovery_backfill.count(osd);
1422 }
1423 bool is_acting(pg_shard_t osd) const {
1424 return has_shard(pool.info.is_erasure(), acting, osd);
1425 }
1426 bool is_up(pg_shard_t osd) const {
1427 return has_shard(pool.info.is_erasure(), up, osd);
1428 }
1429 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
1430 if (ec) {
1431 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
1432 } else {
1433 return std::find(v.begin(), v.end(), osd.osd) != v.end();
1434 }
1435 }
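  /*
   * Worked example for has_shard() (hypothetical acting set): for an EC
   * pool membership is positional, for a replicated pool it is a plain
   * linear search.
   *
   *   vector<int> acting = {5, 2, 7};
   *   has_shard(true,  acting, pg_shard_t(2, shard_id_t(1)));        // true: acting[1] == 2
   *   has_shard(true,  acting, pg_shard_t(2, shard_id_t(0)));        // false: acting[0] == 5
   *   has_shard(false, acting, pg_shard_t(2, shard_id_t::NO_SHARD)); // true: 2 is in the vector
   */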
1436
1437 bool needs_recovery() const;
1438 bool needs_backfill() const;
1439
1440 /// clip calculated priority to reasonable range
1441 int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
1442 /// get log recovery reservation priority
1443 unsigned get_recovery_priority();
1444 /// get backfill reservation priority
1445 unsigned get_backfill_priority();
1446 /// get priority for pg deletion
1447 unsigned get_delete_priority();
1448
1449 void try_mark_clean(); ///< mark an active pg clean
1450
1451 /// return [start,end) bounds for required past_intervals
1452 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
1453 const pg_info_t &info,
1454 epoch_t oldest_map) {
1455 epoch_t start = std::max(
1456 info.history.last_epoch_clean ? info.history.last_epoch_clean :
1457 info.history.epoch_pool_created,
1458 oldest_map);
1459 epoch_t end = std::max(
1460 info.history.same_interval_since,
1461 info.history.epoch_pool_created);
1462 return make_pair(start, end);
1463 }
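  /*
   * Worked example (hypothetical epochs): with last_epoch_clean = 880,
   * epoch_pool_created = 500, same_interval_since = 920 and
   * oldest_map = 900:
   *
   *   start = max(880, 900) = 900
   *   end   = max(920, 500) = 920
   *
   * so past_intervals must cover [900, 920).
   */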
1464 void check_past_interval_bounds() const;
1465 PastIntervals::PriorSet build_prior();
1466
1467 void remove_down_peer_info(const OSDMapRef osdmap);
1468
1469 bool adjust_need_up_thru(const OSDMapRef osdmap);
1470
1471 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1472 virtual void dump_recovery_info(Formatter *f) const = 0;
1473
1474 void calc_min_last_complete_ondisk() {
1475 eversion_t min = last_complete_ondisk;
1476 ceph_assert(!acting_recovery_backfill.empty());
1477 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1478 i != acting_recovery_backfill.end();
1479 ++i) {
1480 if (*i == get_primary()) continue;
1481 if (peer_last_complete_ondisk.count(*i) == 0)
1482 return; // we don't have complete info
1483 eversion_t a = peer_last_complete_ondisk[*i];
1484 if (a < min)
1485 min = a;
1486 }
1487 if (min == min_last_complete_ondisk)
1488 return;
1489 min_last_complete_ondisk = min;
1490 return;
1491 }
1492
1493 virtual void calc_trim_to() = 0;
1494
1495 virtual void calc_trim_to_aggressive() = 0;
1496
1497 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1498 pg_missing_t& omissing, pg_shard_t from);
1499 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1500 pg_missing_t& omissing, pg_shard_t from);
1501 bool proc_replica_info(
1502 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1503
1504 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1505 PG *pg;
1506 ObjectStore::Transaction *t;
1507 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1508
1509 // LogEntryHandler
1510 void remove(const hobject_t &hoid) override {
1511 pg->get_pgbackend()->remove(hoid, t);
1512 }
1513 void try_stash(const hobject_t &hoid, version_t v) override {
1514 pg->get_pgbackend()->try_stash(hoid, v, t);
1515 }
1516 void rollback(const pg_log_entry_t &entry) override {
1517 ceph_assert(entry.can_rollback());
1518 pg->get_pgbackend()->rollback(entry, t);
1519 }
1520 void rollforward(const pg_log_entry_t &entry) override {
1521 pg->get_pgbackend()->rollforward(entry, t);
1522 }
1523 void trim(const pg_log_entry_t &entry) override {
1524 pg->get_pgbackend()->trim(entry, t);
1525 }
1526 };
1527
1528 void update_object_snap_mapping(
1529 ObjectStore::Transaction *t, const hobject_t &soid,
1530 const set<snapid_t> &snaps);
1531 void clear_object_snap_mapping(
1532 ObjectStore::Transaction *t, const hobject_t &soid);
1533 void remove_snap_mapped_object(
1534 ObjectStore::Transaction& t, const hobject_t& soid);
1535 void merge_log(
1536 ObjectStore::Transaction& t, pg_info_t &oinfo,
1537 pg_log_t &olog, pg_shard_t from);
1538 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1539 bool search_for_missing(
1540 const pg_info_t &oinfo, const pg_missing_t &omissing,
1541 pg_shard_t fromosd,
1542 RecoveryCtx*);
1543
1544 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1545
1546 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1547 const map<pg_shard_t, pg_info_t> &infos,
1548 bool restrict_to_up_acting,
1549 bool *history_les_bound) const;
1550 static void calc_ec_acting(
1551 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1552 unsigned size,
1553 const vector<int> &acting,
1554 const vector<int> &up,
1555 const map<pg_shard_t, pg_info_t> &all_info,
1556 bool restrict_to_up_acting,
1557 vector<int> *want,
1558 set<pg_shard_t> *backfill,
1559 set<pg_shard_t> *acting_backfill,
1560 ostream &ss);
1561 static void calc_replicated_acting(
1562 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1563 uint64_t force_auth_primary_missing_objects,
1564 unsigned size,
1565 const vector<int> &acting,
1566 const vector<int> &up,
1567 pg_shard_t up_primary,
1568 const map<pg_shard_t, pg_info_t> &all_info,
1569 bool restrict_to_up_acting,
1570 vector<int> *want,
1571 set<pg_shard_t> *backfill,
1572 set<pg_shard_t> *acting_backfill,
1573 const OSDMapRef osdmap,
1574 ostream &ss);
1575 void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1576 const pg_info_t &auth_info,
1577 vector<int> *want,
1578 set<pg_shard_t> *async_recovery,
1579 const OSDMapRef osdmap) const;
1580 void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1581 const pg_info_t &auth_info,
1582 vector<int> *want,
1583 set<pg_shard_t> *async_recovery,
1584 const OSDMapRef osdmap) const;
1585
1586 bool recoverable_and_ge_min_size(const vector<int> &want) const;
1587 bool choose_acting(pg_shard_t &auth_log_shard,
1588 bool restrict_to_up_acting,
1589 bool *history_les_bound);
1590 void build_might_have_unfound();
1591 void activate(
1592 ObjectStore::Transaction& t,
1593 epoch_t activation_epoch,
1594 map<int, map<spg_t,pg_query_t> >& query_map,
1595 map<int,
1596 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1597 RecoveryCtx *ctx);
1598
1599 struct C_PG_ActivateCommitted : public Context {
1600 PGRef pg;
1601 epoch_t epoch;
1602 epoch_t activation_epoch;
1603 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1604 : pg(p), epoch(e), activation_epoch(ae) {}
1605 void finish(int r) override {
1606 pg->_activate_committed(epoch, activation_epoch);
1607 }
1608 };
1609 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1610 void all_activated_and_committed();
1611
1612 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1613
1614 bool have_unfound() const {
1615 return missing_loc.have_unfound();
1616 }
1617 uint64_t get_num_unfound() const {
1618 return missing_loc.num_unfound();
1619 }
1620 bool all_missing_unfound() const {
1621 const auto& missing = pg_log.get_missing();
1622 if (!missing.have_missing())
1623 return false;
1624 for (auto& m : missing.get_items()) {
1625 if (!missing_loc.is_unfound(m.first))
1626 return false;
1627 }
1628 return true;
1629 }
1630
1631 virtual void check_local() = 0;
1632
1633 void purge_strays();
1634
1635 void update_heartbeat_peers();
1636
1637 Context *finish_sync_event;
1638
1639 Context *finish_recovery();
1640 void _finish_recovery(Context *c);
1641 struct C_PG_FinishRecovery : public Context {
1642 PGRef pg;
1643 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1644 void finish(int r) override {
1645 pg->_finish_recovery(this);
1646 }
1647 };
1648 void cancel_recovery();
1649 void clear_recovery_state();
1650 virtual void _clear_recovery_state() = 0;
1651 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1652 void start_recovery_op(const hobject_t& soid);
1653 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1654
1655 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1656
1657 friend class C_OSD_RepModify_Commit;
1658 friend class C_DeleteMore;
1659
1660 // -- backoff --
1661 Mutex backoff_lock; // orders inside Backoff::lock
1662 map<hobject_t,set<BackoffRef>> backoffs;
1663
1664 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1665 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1666 void release_backoffs(const hobject_t& o) {
1667 release_backoffs(o, o);
1668 }
1669 void clear_backoffs();
1670
1671 void add_pg_backoff(SessionRef s) {
1672 hobject_t begin = info.pgid.pgid.get_hobj_start();
1673 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1674 add_backoff(s, begin, end);
1675 }
1676 void release_pg_backoffs() {
1677 hobject_t begin = info.pgid.pgid.get_hobj_start();
1678 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1679 release_backoffs(begin, end);
1680 }
1681
1682 // -- scrub --
1683 public:
1684 struct Scrubber {
1685 Scrubber();
1686 ~Scrubber();
1687
1688 // metadata
1689 set<pg_shard_t> reserved_peers;
1690 bool reserved, reserve_failed;
1691 epoch_t epoch_start;
1692
1693 // common to both scrubs
1694 bool active;
1695 set<pg_shard_t> waiting_on_whom;
1696 int shallow_errors;
1697 int deep_errors;
1698 int fixed;
1699 ScrubMap primary_scrubmap;
1700 ScrubMapBuilder primary_scrubmap_pos;
1701 epoch_t replica_scrub_start = 0;
1702 ScrubMap replica_scrubmap;
1703 ScrubMapBuilder replica_scrubmap_pos;
1704 map<pg_shard_t, ScrubMap> received_maps;
1705 OpRequestRef active_rep_scrub;
1706 utime_t scrub_reg_stamp; // stamp we registered for
1707
1708 static utime_t scrub_must_stamp() { return utime_t(0,1); }
1709
1710 omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };
1711
1712 // For async sleep
1713 bool sleeping = false;
1714 bool needs_sleep = true;
1715 utime_t sleep_start;
1716
1717 // flags to indicate explicitly requested scrubs (by admin)
1718 bool must_scrub, must_deep_scrub, must_repair, need_auto;
1719
1720 // Priority to use for scrub scheduling
1721 unsigned priority = 0;
1722
1723 bool time_for_deep;
1724 // this flag indicates whether we would like to do auto-repair of the PG or not
1725 bool auto_repair;
1726 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1727 bool check_repair;
1728 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1729 // we should deep scrub in order to auto repair
1730 bool deep_scrub_on_error;
1731
1732 // Maps from objects with errors to missing/inconsistent peers
1733 map<hobject_t, set<pg_shard_t>> missing;
1734 map<hobject_t, set<pg_shard_t>> inconsistent;
1735
1736 // Map from object with errors to good peers
1737 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1738
1739 // Cleaned map pending snap metadata scrub
1740 ScrubMap cleaned_meta_map;
1741
1742 void clean_meta_map(ScrubMap &for_meta_scrub) {
1743 if (end.is_max() ||
1744 cleaned_meta_map.objects.empty()) {
1745 cleaned_meta_map.swap(for_meta_scrub);
1746 } else {
1747 auto iter = cleaned_meta_map.objects.end();
1748 --iter; // not empty, see if clause
1749 auto begin = cleaned_meta_map.objects.begin();
1750 if (iter->first.has_snapset()) {
1751 ++iter;
1752 } else {
1753 while (iter != begin) {
1754 auto next = iter--;
1755 if (next->first.get_head() != iter->first.get_head()) {
1756 ++iter;
1757 break;
1758 }
1759 }
1760 }
1761 for_meta_scrub.objects.insert(begin, iter);
1762 cleaned_meta_map.objects.erase(begin, iter);
1763 }
1764 }
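    /*
     * Illustrative example (hypothetical objects, a < b): with
     * cleaned_meta_map.objects = { a:clone1, a:clone2, a:head, b:clone1 }
     * and a non-max chunk end, everything up to and including a:head is
     * moved into for_meta_scrub, while b:clone1 is kept back so that an
     * object's clones and head/snapset are never split across chunks.
     */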
1765
1766 // digest updates which we are waiting on
1767 int num_digest_updates_pending;
1768
1769 // chunky scrub
1770 hobject_t start, end; // [start,end)
1771 hobject_t max_end; // Largest end that may have been sent to replicas
1772 eversion_t subset_last_update;
1773
1774 // chunky scrub state
1775 enum State {
1776 INACTIVE,
1777 NEW_CHUNK,
1778 WAIT_PUSHES,
1779 WAIT_LAST_UPDATE,
1780 BUILD_MAP,
1781 BUILD_MAP_DONE,
1782 WAIT_REPLICAS,
1783 COMPARE_MAPS,
1784 WAIT_DIGEST_UPDATES,
1785 FINISH,
1786 BUILD_MAP_REPLICA,
1787 } state;
1788
1789 std::unique_ptr<Scrub::Store> store;
1790 // deep scrub
1791 bool deep;
1792 int preempt_left;
1793 int preempt_divisor;
1794
1795 list<Context*> callbacks;
1796 void add_callback(Context *context) {
1797 callbacks.push_back(context);
1798 }
1799 void run_callbacks() {
1800 list<Context*> to_run;
1801 to_run.swap(callbacks);
1802 for (list<Context*>::iterator i = to_run.begin();
1803 i != to_run.end();
1804 ++i) {
1805 (*i)->complete(0);
1806 }
1807 }
1808
1809 static const char *state_string(const PG::Scrubber::State& state) {
1810 const char *ret = NULL;
1811 switch( state )
1812 {
1813 case INACTIVE: ret = "INACTIVE"; break;
1814 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1815 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1816 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1817 case BUILD_MAP: ret = "BUILD_MAP"; break;
1818 case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1819 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1820 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1821 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1822 case FINISH: ret = "FINISH"; break;
1823 case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1824 }
1825 return ret;
1826 }
1827
1828 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1829
1830 // clear all state
1831 void reset() {
1832 active = false;
1833 waiting_on_whom.clear();
1834 if (active_rep_scrub) {
1835 active_rep_scrub = OpRequestRef();
1836 }
1837 received_maps.clear();
1838
1839 must_scrub = false;
1840 must_deep_scrub = false;
1841 must_repair = false;
1842 need_auto = false;
1843 time_for_deep = false;
1844 auto_repair = false;
1845 check_repair = false;
1846 deep_scrub_on_error = false;
1847
1848 state = PG::Scrubber::INACTIVE;
1849 start = hobject_t();
1850 end = hobject_t();
1851 max_end = hobject_t();
1852 subset_last_update = eversion_t();
1853 shallow_errors = 0;
1854 deep_errors = 0;
1855 fixed = 0;
1856 omap_stats = (const struct omap_stat_t){ 0 };
1857 deep = false;
1858 run_callbacks();
1859 inconsistent.clear();
1860 missing.clear();
1861 authoritative.clear();
1862 num_digest_updates_pending = 0;
1863 primary_scrubmap = ScrubMap();
1864 primary_scrubmap_pos.reset();
1865 replica_scrubmap = ScrubMap();
1866 replica_scrubmap_pos.reset();
1867 cleaned_meta_map = ScrubMap();
1868 sleeping = false;
1869 needs_sleep = true;
1870 sleep_start = utime_t();
1871 }
1872
1873 void create_results(const hobject_t& obj);
1874 void cleanup_store(ObjectStore::Transaction *t);
1875 } scrubber;
1876
1877 protected:
1878 bool scrub_after_recovery;
1879
1880 int active_pushes;
1881
1882 bool scrub_can_preempt = false;
1883 bool scrub_preempted = false;
1884
1885 // we allow some number of preemptions of the scrub, during which we do
1886 // not block writes.  after that we start to block, and once we start
1887 // blocking we do not stop until the scrub range is completed.
1888 bool write_blocked_by_scrub(const hobject_t &soid);
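// A minimal usage sketch (hypothetical caller and wait list; the real write
// path lives in PrimaryLogPG):
//   if (write_blocked_by_scrub(head_oid)) {
//     waiting_for_scrub.push_back(op);  // requeued when the chunk completes
//     return;
//   }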
1889
1890 /// true if the given range intersects the scrub interval in any way
1891 bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1892
1893 void repair_object(
1894 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1895 pg_shard_t bad_peer);
1896
1897 void chunky_scrub(ThreadPool::TPHandle &handle);
1898 void scrub_compare_maps();
1899 /**
1900 * return true if any inconsistency or missing object was repaired, false otherwise
1901 */
1902 bool scrub_process_inconsistent();
1903 bool ops_blocked_by_scrub() const;
1904 void scrub_finish();
1905 void scrub_clear_state(bool keep_repair = false);
1906 void _scan_snaps(ScrubMap &map);
1907 void _repair_oinfo_oid(ScrubMap &map);
1908 void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
1909 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1910 hobject_t start, hobject_t end, bool deep,
1911 bool allow_preemption);
1912 int build_scrub_map_chunk(
1913 ScrubMap &map,
1914 ScrubMapBuilder &pos,
1915 hobject_t start, hobject_t end, bool deep,
1916 ThreadPool::TPHandle &handle);
1917 /**
1918 * returns true if [begin, end) is good to scrub at this time
1919 * a false return value obliges the implementer to requeue scrub when the
1920 * condition preventing scrub clears
1921 */
1922 virtual bool _range_available_for_scrub(
1923 const hobject_t &begin, const hobject_t &end) = 0;
1924 virtual void scrub_snapshot_metadata(
1925 ScrubMap &map,
1926 const std::map<hobject_t,
1927 pair<boost::optional<uint32_t>,
1928 boost::optional<uint32_t>>> &missing_digest) { }
1929 virtual void _scrub_clear_state() { }
1930 virtual void _scrub_finish() { }
1931 void clear_scrub_reserved();
1932 void scrub_reserve_replicas();
1933 void scrub_unreserve_replicas();
1934 bool scrub_all_replicas_reserved() const;
1935
1936 void replica_scrub(
1937 OpRequestRef op,
1938 ThreadPool::TPHandle &handle);
1939 void do_replica_scrub_map(OpRequestRef op);
1940
1941 void handle_scrub_reserve_request(OpRequestRef op);
1942 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1943 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1944 void handle_scrub_reserve_release(OpRequestRef op);
1945
1946 void reject_reservation();
1947 void schedule_backfill_retry(float retry);
1948 void schedule_recovery_retry(float retry);
1949
1950 // -- recovery state --
1951
1952 template <class EVT>
1953 struct QueuePeeringEvt : Context {
1954 PGRef pg;
1955 epoch_t epoch;
1956 EVT evt;
1957 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1958 pg(pg), epoch(epoch), evt(evt) {}
1959 void finish(int r) override {
1960 pg->lock();
1961 pg->queue_peering_event(PGPeeringEventRef(
1962 new PGPeeringEvent(
1963 epoch,
1964 epoch,
1965 evt)));
1966 pg->unlock();
1967 }
1968 };
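// A usage sketch, assuming a transaction completion hook as used elsewhere
// in the OSD (the transaction `t` here is illustrative):
//   t.register_on_applied(
//     new QueuePeeringEvt<IntervalFlush>(
//       this, get_osdmap_epoch(), IntervalFlush()));
// When the context fires, the event is re-queued under the PG lock at the
// epoch captured above, and dropped later if the interval has changed.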
1969
1970
1971 struct QueryState : boost::statechart::event< QueryState > {
1972 Formatter *f;
1973 explicit QueryState(Formatter *f) : f(f) {}
1974 void print(std::ostream *out) const {
1975 *out << "Query";
1976 }
1977 };
1978
1979 public:
1980 int pg_stat_adjust(osd_stat_t *new_stat);
1981 protected:
1982
1983 struct AdvMap : boost::statechart::event< AdvMap > {
1984 OSDMapRef osdmap;
1985 OSDMapRef lastmap;
1986 vector<int> newup, newacting;
1987 int up_primary, acting_primary;
1988 AdvMap(
1989 OSDMapRef osdmap, OSDMapRef lastmap,
1990 vector<int>& newup, int up_primary,
1991 vector<int>& newacting, int acting_primary):
1992 osdmap(osdmap), lastmap(lastmap),
1993 newup(newup),
1994 newacting(newacting),
1995 up_primary(up_primary),
1996 acting_primary(acting_primary) {}
1997 void print(std::ostream *out) const {
1998 *out << "AdvMap";
1999 }
2000 };
2001
2002 struct ActMap : boost::statechart::event< ActMap > {
2003 ActMap() : boost::statechart::event< ActMap >() {}
2004 void print(std::ostream *out) const {
2005 *out << "ActMap";
2006 }
2007 };
2008 struct Activate : boost::statechart::event< Activate > {
2009 epoch_t activation_epoch;
2010 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
2011 activation_epoch(q) {}
2012 void print(std::ostream *out) const {
2013 *out << "Activate from " << activation_epoch;
2014 }
2015 };
2016 public:
2017 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
2018 explicit UnfoundBackfill() {}
2019 void print(std::ostream *out) const {
2020 *out << "UnfoundBackfill";
2021 }
2022 };
2023 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
2024 explicit UnfoundRecovery() {}
2025 void print(std::ostream *out) const {
2026 *out << "UnfoundRecovery";
2027 }
2028 };
2029
2030 struct RequestScrub : boost::statechart::event<RequestScrub> {
2031 bool deep;
2032 bool repair;
2033 explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
2034 void print(std::ostream *out) const {
2035 *out << "RequestScrub(" << (deep ? "deep" : "shallow")
2036 << (repair ? " repair)" : ")");
2037 }
2038 };
2039
2040 protected:
2041 TrivialEvent(Initialize)
2042 TrivialEvent(GotInfo)
2043 TrivialEvent(NeedUpThru)
2044 TrivialEvent(Backfilled)
2045 TrivialEvent(LocalBackfillReserved)
2046 TrivialEvent(RejectRemoteReservation)
2047 public:
2048 TrivialEvent(RequestBackfill)
2049 protected:
2050 TrivialEvent(RemoteRecoveryPreempted)
2051 TrivialEvent(RemoteBackfillPreempted)
2052 TrivialEvent(BackfillTooFull)
2053 TrivialEvent(RecoveryTooFull)
2054
2055 TrivialEvent(MakePrimary)
2056 TrivialEvent(MakeStray)
2057 TrivialEvent(NeedActingChange)
2058 TrivialEvent(IsIncomplete)
2059 TrivialEvent(IsDown)
2060
2061 TrivialEvent(AllReplicasRecovered)
2062 TrivialEvent(DoRecovery)
2063 TrivialEvent(LocalRecoveryReserved)
2064 public:
2065 protected:
2066 TrivialEvent(AllRemotesReserved)
2067 TrivialEvent(AllBackfillsReserved)
2068 TrivialEvent(GoClean)
2069
2070 TrivialEvent(AllReplicasActivated)
2071
2072 TrivialEvent(IntervalFlush)
2073
2074 public:
2075 TrivialEvent(DeleteStart)
2076 TrivialEvent(DeleteSome)
2077
2078 TrivialEvent(SetForceRecovery)
2079 TrivialEvent(UnsetForceRecovery)
2080 TrivialEvent(SetForceBackfill)
2081 TrivialEvent(UnsetForceBackfill)
2082
2083 protected:
2084 TrivialEvent(DeleteReserved)
2085 TrivialEvent(DeleteInterrupted)
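// Each TrivialEvent(T) above expands (via the macro defined earlier in this
// header) to a boost::statechart event with a print() method, so peering can
// be driven with plain event objects, e.g. (sketch):
//   recovery_state.handle_event(DoRecovery(), &rctx);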
2086
2087 /* Encapsulates the PG peering and recovery state machine */
2088 class RecoveryState {
2089 void start_handle(RecoveryCtx *new_ctx);
2090 void end_handle();
2091 public:
2092 void begin_block_outgoing();
2093 void end_block_outgoing();
2094 void clear_blocked_outgoing();
2095 private:
2096
2097 /* States */
2098 struct Initial;
2099 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
2100 RecoveryState *state;
2101 public:
2102 PG *pg;
2103
2104 utime_t event_time;
2105 uint64_t event_count;
2106
2107 void clear_event_counters() {
2108 event_time = utime_t();
2109 event_count = 0;
2110 }
2111
2112 void log_enter(const char *state_name);
2113 void log_exit(const char *state_name, utime_t duration);
2114
2115 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
2116
2117 /* Accessor functions for state methods */
2118 ObjectStore::Transaction* get_cur_transaction() {
2119 ceph_assert(state->rctx);
2120 ceph_assert(state->rctx->transaction);
2121 return state->rctx->transaction;
2122 }
2123
2124 void send_query(pg_shard_t to, const pg_query_t &query) {
2125 ceph_assert(state->rctx);
2126 ceph_assert(state->rctx->query_map);
2127 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
2128 query;
2129 }
2130
2131 map<int, map<spg_t, pg_query_t> > *get_query_map() {
2132 ceph_assert(state->rctx);
2133 ceph_assert(state->rctx->query_map);
2134 return state->rctx->query_map;
2135 }
2136
2137 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
2138 ceph_assert(state->rctx);
2139 ceph_assert(state->rctx->info_map);
2140 return state->rctx->info_map;
2141 }
2142
2143 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
2144
2145 void send_notify(pg_shard_t to,
2146 const pg_notify_t &info, const PastIntervals &pi) {
2147 ceph_assert(state->rctx);
2148 state->rctx->send_notify(to, info, pi);
2149 }
2150 };
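// State reactions reach these accessors through the statechart context; a
// sketch of the pattern used by the react() implementations in PG.cc
// (`peer` is illustrative):
//   ObjectStore::Transaction *t =
//     context< RecoveryMachine >().get_cur_transaction();
//   context< RecoveryMachine >().send_query(
//     peer, pg_query_t(pg_query_t::INFO, peer.shard, pg->pg_whoami.shard,
//                      pg->info.history, pg->get_osdmap_epoch()));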
2151 friend class RecoveryMachine;
2152
2153 /* States */
2154 // Initial
2155 // Reset
2156 // Start
2157 // Started
2158 // Primary
2159 // WaitActingChange
2160 // Peering
2161 // GetInfo
2162 // GetLog
2163 // GetMissing
2164 // WaitUpThru
2165 // Incomplete
2166 // Active
2167 // Activating
2168 // Clean
2169 // Recovered
2170 // Backfilling
2171 // WaitRemoteBackfillReserved
2172 // WaitLocalBackfillReserved
2173 // NotBackfilling
2174 // NotRecovering
2175 // Recovering
2176 // WaitRemoteRecoveryReserved
2177 // WaitLocalRecoveryReserved
2178 // ReplicaActive
2179 // RepNotRecovering
2180 // RepRecovering
2181 // RepWaitBackfillReserved
2182 // RepWaitRecoveryReserved
2183 // Stray
2184 // ToDelete
2185 // WaitDeleteReserved
2186 // Deleting
2187 // Crashed
2188
2189 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
2190 explicit Crashed(my_context ctx);
2191 };
2192
2193 struct Reset;
2194
2195 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
2196 explicit Initial(my_context ctx);
2197 void exit();
2198
2199 typedef boost::mpl::list <
2200 boost::statechart::transition< Initialize, Reset >,
2201 boost::statechart::custom_reaction< NullEvt >,
2202 boost::statechart::transition< boost::statechart::event_base, Crashed >
2203 > reactions;
2204
2205 boost::statechart::result react(const MNotifyRec&);
2206 boost::statechart::result react(const MInfoRec&);
2207 boost::statechart::result react(const MLogRec&);
2208 boost::statechart::result react(const boost::statechart::event_base&) {
2209 return discard_event();
2210 }
2211 };
2212
2213 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
2214 explicit Reset(my_context ctx);
2215 void exit();
2216
2217 typedef boost::mpl::list <
2218 boost::statechart::custom_reaction< QueryState >,
2219 boost::statechart::custom_reaction< AdvMap >,
2220 boost::statechart::custom_reaction< ActMap >,
2221 boost::statechart::custom_reaction< NullEvt >,
2222 boost::statechart::custom_reaction< IntervalFlush >,
2223 boost::statechart::transition< boost::statechart::event_base, Crashed >
2224 > reactions;
2225 boost::statechart::result react(const QueryState& q);
2226 boost::statechart::result react(const AdvMap&);
2227 boost::statechart::result react(const ActMap&);
2228 boost::statechart::result react(const IntervalFlush&);
2229 boost::statechart::result react(const boost::statechart::event_base&) {
2230 return discard_event();
2231 }
2232 };
2233
2234 struct Start;
2235
2236 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
2237 explicit Started(my_context ctx);
2238 void exit();
2239
2240 typedef boost::mpl::list <
2241 boost::statechart::custom_reaction< QueryState >,
2242 boost::statechart::custom_reaction< AdvMap >,
2243 boost::statechart::custom_reaction< IntervalFlush >,
2244 // ignored
2245 boost::statechart::custom_reaction< NullEvt >,
2246 boost::statechart::custom_reaction<SetForceRecovery>,
2247 boost::statechart::custom_reaction<UnsetForceRecovery>,
2248 boost::statechart::custom_reaction<SetForceBackfill>,
2249 boost::statechart::custom_reaction<UnsetForceBackfill>,
2250 boost::statechart::custom_reaction<RequestScrub>,
2251 // crash
2252 boost::statechart::transition< boost::statechart::event_base, Crashed >
2253 > reactions;
2254 boost::statechart::result react(const QueryState& q);
2255 boost::statechart::result react(const AdvMap&);
2256 boost::statechart::result react(const IntervalFlush&);
2257 boost::statechart::result react(const boost::statechart::event_base&) {
2258 return discard_event();
2259 }
2260 };
2261
2262 struct Primary;
2263 struct Stray;
2264
2265 struct Start : boost::statechart::state< Start, Started >, NamedState {
2266 explicit Start(my_context ctx);
2267 void exit();
2268
2269 typedef boost::mpl::list <
2270 boost::statechart::transition< MakePrimary, Primary >,
2271 boost::statechart::transition< MakeStray, Stray >
2272 > reactions;
2273 };
2274
2275 struct Peering;
2276 struct WaitActingChange;
2277 struct Incomplete;
2278 struct Down;
2279
2280 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
2281 explicit Primary(my_context ctx);
2282 void exit();
2283
2284 typedef boost::mpl::list <
2285 boost::statechart::custom_reaction< ActMap >,
2286 boost::statechart::custom_reaction< MNotifyRec >,
2287 boost::statechart::transition< NeedActingChange, WaitActingChange >,
2288 boost::statechart::custom_reaction<SetForceRecovery>,
2289 boost::statechart::custom_reaction<UnsetForceRecovery>,
2290 boost::statechart::custom_reaction<SetForceBackfill>,
2291 boost::statechart::custom_reaction<UnsetForceBackfill>,
2292 boost::statechart::custom_reaction<RequestScrub>
2293 > reactions;
2294 boost::statechart::result react(const ActMap&);
2295 boost::statechart::result react(const MNotifyRec&);
2296 boost::statechart::result react(const SetForceRecovery&);
2297 boost::statechart::result react(const UnsetForceRecovery&);
2298 boost::statechart::result react(const SetForceBackfill&);
2299 boost::statechart::result react(const UnsetForceBackfill&);
2300 boost::statechart::result react(const RequestScrub&);
2301 };
2302
2303 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
2304 NamedState {
2305 typedef boost::mpl::list <
2306 boost::statechart::custom_reaction< QueryState >,
2307 boost::statechart::custom_reaction< AdvMap >,
2308 boost::statechart::custom_reaction< MLogRec >,
2309 boost::statechart::custom_reaction< MInfoRec >,
2310 boost::statechart::custom_reaction< MNotifyRec >
2311 > reactions;
2312 explicit WaitActingChange(my_context ctx);
2313 boost::statechart::result react(const QueryState& q);
2314 boost::statechart::result react(const AdvMap&);
2315 boost::statechart::result react(const MLogRec&);
2316 boost::statechart::result react(const MInfoRec&);
2317 boost::statechart::result react(const MNotifyRec&);
2318 void exit();
2319 };
2320
2321 struct GetInfo;
2322 struct Active;
2323
2324 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
2325 PastIntervals::PriorSet prior_set;
2326 bool history_les_bound; ///< whether we need osd_find_best_info_ignore_history_les
2327
2328 explicit Peering(my_context ctx);
2329 void exit();
2330
2331 typedef boost::mpl::list <
2332 boost::statechart::custom_reaction< QueryState >,
2333 boost::statechart::transition< Activate, Active >,
2334 boost::statechart::custom_reaction< AdvMap >
2335 > reactions;
2336 boost::statechart::result react(const QueryState& q);
2337 boost::statechart::result react(const AdvMap &advmap);
2338 };
2339
2340 struct WaitLocalRecoveryReserved;
2341 struct Activating;
2342 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
2343 explicit Active(my_context ctx);
2344 void exit();
2345
2346 const set<pg_shard_t> remote_shards_to_reserve_recovery;
2347 const set<pg_shard_t> remote_shards_to_reserve_backfill;
2348 bool all_replicas_activated;
2349
2350 typedef boost::mpl::list <
2351 boost::statechart::custom_reaction< QueryState >,
2352 boost::statechart::custom_reaction< ActMap >,
2353 boost::statechart::custom_reaction< AdvMap >,
2354 boost::statechart::custom_reaction< MInfoRec >,
2355 boost::statechart::custom_reaction< MNotifyRec >,
2356 boost::statechart::custom_reaction< MLogRec >,
2357 boost::statechart::custom_reaction< MTrim >,
2358 boost::statechart::custom_reaction< Backfilled >,
2359 boost::statechart::custom_reaction< AllReplicasActivated >,
2360 boost::statechart::custom_reaction< DeferRecovery >,
2361 boost::statechart::custom_reaction< DeferBackfill >,
2362 boost::statechart::custom_reaction< UnfoundRecovery >,
2363 boost::statechart::custom_reaction< UnfoundBackfill >,
2364 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2365 boost::statechart::custom_reaction< RemoteReservationRevoked>,
2366 boost::statechart::custom_reaction< DoRecovery>
2367 > reactions;
2368 boost::statechart::result react(const QueryState& q);
2369 boost::statechart::result react(const ActMap&);
2370 boost::statechart::result react(const AdvMap&);
2371 boost::statechart::result react(const MInfoRec& infoevt);
2372 boost::statechart::result react(const MNotifyRec& notevt);
2373 boost::statechart::result react(const MLogRec& logevt);
2374 boost::statechart::result react(const MTrim& trimevt);
2375 boost::statechart::result react(const Backfilled&) {
2376 return discard_event();
2377 }
2378 boost::statechart::result react(const AllReplicasActivated&);
2379 boost::statechart::result react(const DeferRecovery& evt) {
2380 return discard_event();
2381 }
2382 boost::statechart::result react(const DeferBackfill& evt) {
2383 return discard_event();
2384 }
2385 boost::statechart::result react(const UnfoundRecovery& evt) {
2386 return discard_event();
2387 }
2388 boost::statechart::result react(const UnfoundBackfill& evt) {
2389 return discard_event();
2390 }
2391 boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
2392 return discard_event();
2393 }
2394 boost::statechart::result react(const RemoteReservationRevoked&) {
2395 return discard_event();
2396 }
2397 boost::statechart::result react(const DoRecovery&) {
2398 return discard_event();
2399 }
2400 };
2401
2402 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
2403 typedef boost::mpl::list<
2404 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2405 boost::statechart::custom_reaction<SetForceRecovery>,
2406 boost::statechart::custom_reaction<SetForceBackfill>
2407 > reactions;
2408 explicit Clean(my_context ctx);
2409 void exit();
2410 boost::statechart::result react(const boost::statechart::event_base&) {
2411 return discard_event();
2412 }
2413 };
2414
2415 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
2416 typedef boost::mpl::list<
2417 boost::statechart::transition< GoClean, Clean >,
2418 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2419 boost::statechart::custom_reaction< AllReplicasActivated >
2420 > reactions;
2421 explicit Recovered(my_context ctx);
2422 void exit();
2423 boost::statechart::result react(const AllReplicasActivated&) {
2424 post_event(GoClean());
2425 return forward_event();
2426 }
2427 };
2428
2429 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2430 typedef boost::mpl::list<
2431 boost::statechart::custom_reaction< Backfilled >,
2432 boost::statechart::custom_reaction< DeferBackfill >,
2433 boost::statechart::custom_reaction< UnfoundBackfill >,
2434 boost::statechart::custom_reaction< RemoteReservationRejected >,
2435 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2436 boost::statechart::custom_reaction< RemoteReservationRevoked>
2437 > reactions;
2438 explicit Backfilling(my_context ctx);
2439 boost::statechart::result react(const RemoteReservationRejected& evt) {
2440 // for compat with old peers
2441 post_event(RemoteReservationRevokedTooFull());
2442 return discard_event();
2443 }
2444 void backfill_release_reservations();
2445 boost::statechart::result react(const Backfilled& evt);
2446 boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
2447 boost::statechart::result react(const RemoteReservationRevoked& evt);
2448 boost::statechart::result react(const DeferBackfill& evt);
2449 boost::statechart::result react(const UnfoundBackfill& evt);
2450 void cancel_backfill();
2451 void exit();
2452 };
2453
2454 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2455 typedef boost::mpl::list<
2456 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2457 boost::statechart::custom_reaction< RemoteReservationRejected >,
2458 boost::statechart::custom_reaction< RemoteReservationRevoked >,
2459 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2460 > reactions;
2461 set<pg_shard_t>::const_iterator backfill_osd_it;
2462 explicit WaitRemoteBackfillReserved(my_context ctx);
2463 void retry();
2464 void exit();
2465 boost::statechart::result react(const RemoteBackfillReserved& evt);
2466 boost::statechart::result react(const RemoteReservationRejected& evt);
2467 boost::statechart::result react(const RemoteReservationRevoked& evt);
2468 };
2469
2470 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2471 typedef boost::mpl::list<
2472 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2473 > reactions;
2474 explicit WaitLocalBackfillReserved(my_context ctx);
2475 void exit();
2476 };
2477
2478 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2479 typedef boost::mpl::list<
2480 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2481 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2482 boost::statechart::custom_reaction< RemoteReservationRejected >
2483 > reactions;
2484 explicit NotBackfilling(my_context ctx);
2485 void exit();
2486 boost::statechart::result react(const RemoteBackfillReserved& evt);
2487 boost::statechart::result react(const RemoteReservationRejected& evt);
2488 };
2489
2490 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2491 typedef boost::mpl::list<
2492 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2493 boost::statechart::custom_reaction< DeferRecovery >,
2494 boost::statechart::custom_reaction< UnfoundRecovery >
2495 > reactions;
2496 explicit NotRecovering(my_context ctx);
2497 boost::statechart::result react(const DeferRecovery& evt) {
2498 /* no-op */
2499 return discard_event();
2500 }
2501 boost::statechart::result react(const UnfoundRecovery& evt) {
2502 /* no-op */
2503 return discard_event();
2504 }
2505 void exit();
2506 };
2507
2508 struct ToDelete;
2509 struct RepNotRecovering;
2510 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2511 explicit ReplicaActive(my_context ctx);
2512 void exit();
2513
2514 typedef boost::mpl::list <
2515 boost::statechart::custom_reaction< QueryState >,
2516 boost::statechart::custom_reaction< ActMap >,
2517 boost::statechart::custom_reaction< MQuery >,
2518 boost::statechart::custom_reaction< MInfoRec >,
2519 boost::statechart::custom_reaction< MLogRec >,
2520 boost::statechart::custom_reaction< MTrim >,
2521 boost::statechart::custom_reaction< Activate >,
2522 boost::statechart::custom_reaction< DeferRecovery >,
2523 boost::statechart::custom_reaction< DeferBackfill >,
2524 boost::statechart::custom_reaction< UnfoundRecovery >,
2525 boost::statechart::custom_reaction< UnfoundBackfill >,
2526 boost::statechart::custom_reaction< RemoteBackfillPreempted >,
2527 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2528 boost::statechart::custom_reaction< RecoveryDone >,
2529 boost::statechart::transition<DeleteStart, ToDelete>
2530 > reactions;
2531 boost::statechart::result react(const QueryState& q);
2532 boost::statechart::result react(const MInfoRec& infoevt);
2533 boost::statechart::result react(const MLogRec& logevt);
2534 boost::statechart::result react(const MTrim& trimevt);
2535 boost::statechart::result react(const ActMap&);
2536 boost::statechart::result react(const MQuery&);
2537 boost::statechart::result react(const Activate&);
2538 boost::statechart::result react(const RecoveryDone&) {
2539 return discard_event();
2540 }
2541 boost::statechart::result react(const DeferRecovery& evt) {
2542 return discard_event();
2543 }
2544 boost::statechart::result react(const DeferBackfill& evt) {
2545 return discard_event();
2546 }
2547 boost::statechart::result react(const UnfoundRecovery& evt) {
2548 return discard_event();
2549 }
2550 boost::statechart::result react(const UnfoundBackfill& evt) {
2551 return discard_event();
2552 }
2553 boost::statechart::result react(const RemoteBackfillPreempted& evt) {
2554 return discard_event();
2555 }
2556 boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
2557 return discard_event();
2558 }
2559 };
2560
2561 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2562 typedef boost::mpl::list<
2563 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2564 // for compat with old peers
2565 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2566 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2567 boost::statechart::custom_reaction< BackfillTooFull >,
2568 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2569 boost::statechart::custom_reaction< RemoteBackfillPreempted >
2570 > reactions;
2571 explicit RepRecovering(my_context ctx);
2572 boost::statechart::result react(const RemoteRecoveryPreempted &evt);
2573 boost::statechart::result react(const BackfillTooFull &evt);
2574 boost::statechart::result react(const RemoteBackfillPreempted &evt);
2575 void exit();
2576 };
2577
2578 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2579 typedef boost::mpl::list<
2580 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2581 boost::statechart::custom_reaction< RejectRemoteReservation >,
2582 boost::statechart::custom_reaction< RemoteReservationRejected >,
2583 boost::statechart::custom_reaction< RemoteReservationCanceled >
2584 > reactions;
2585 explicit RepWaitBackfillReserved(my_context ctx);
2586 void exit();
2587 boost::statechart::result react(const RemoteBackfillReserved &evt);
2588 boost::statechart::result react(const RejectRemoteReservation &evt);
2589 boost::statechart::result react(const RemoteReservationRejected &evt);
2590 boost::statechart::result react(const RemoteReservationCanceled &evt);
2591 };
2592
2593 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2594 typedef boost::mpl::list<
2595 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2596 // for compat with old peers
2597 boost::statechart::custom_reaction< RemoteReservationRejected >,
2598 boost::statechart::custom_reaction< RemoteReservationCanceled >
2599 > reactions;
2600 explicit RepWaitRecoveryReserved(my_context ctx);
2601 void exit();
2602 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2603 boost::statechart::result react(const RemoteReservationRejected &evt) {
2604 // for compat with old peers
2605 post_event(RemoteReservationCanceled());
2606 return discard_event();
2607 }
2608 boost::statechart::result react(const RemoteReservationCanceled &evt);
2609 };
2610
2611 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2612 typedef boost::mpl::list<
2613 boost::statechart::custom_reaction< RequestRecoveryPrio >,
2614 boost::statechart::custom_reaction< RequestBackfillPrio >,
2615 boost::statechart::custom_reaction< RejectRemoteReservation >,
2616 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2617 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2618 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2619 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2620 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2621 > reactions;
2622 explicit RepNotRecovering(my_context ctx);
2623 boost::statechart::result react(const RequestRecoveryPrio &evt);
2624 boost::statechart::result react(const RequestBackfillPrio &evt);
2625 boost::statechart::result react(const RemoteBackfillReserved &evt) {
2626 // my reservation completion raced with a RELEASE from primary
2627 return discard_event();
2628 }
2629 boost::statechart::result react(const RemoteRecoveryReserved &evt) {
2630 // my reservation completion raced with a RELEASE from primary
2631 return discard_event();
2632 }
2633 boost::statechart::result react(const RejectRemoteReservation &evt);
2634 void exit();
2635 };
2636
2637 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2638 typedef boost::mpl::list <
2639 boost::statechart::custom_reaction< AllReplicasRecovered >,
2640 boost::statechart::custom_reaction< DeferRecovery >,
2641 boost::statechart::custom_reaction< UnfoundRecovery >,
2642 boost::statechart::custom_reaction< RequestBackfill >
2643 > reactions;
2644 explicit Recovering(my_context ctx);
2645 void exit();
2646 void release_reservations(bool cancel = false);
2647 boost::statechart::result react(const AllReplicasRecovered &evt);
2648 boost::statechart::result react(const DeferRecovery& evt);
2649 boost::statechart::result react(const UnfoundRecovery& evt);
2650 boost::statechart::result react(const RequestBackfill &evt);
2651 };
2652
2653 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2654 typedef boost::mpl::list <
2655 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2656 boost::statechart::transition< AllRemotesReserved, Recovering >
2657 > reactions;
2658 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2659 explicit WaitRemoteRecoveryReserved(my_context ctx);
2660 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2661 void exit();
2662 };
2663
2664 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2665 typedef boost::mpl::list <
2666 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2667 boost::statechart::custom_reaction< RecoveryTooFull >
2668 > reactions;
2669 explicit WaitLocalRecoveryReserved(my_context ctx);
2670 void exit();
2671 boost::statechart::result react(const RecoveryTooFull &evt);
2672 };
2673
2674 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2675 typedef boost::mpl::list <
2676 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2677 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2678 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2679 > reactions;
2680 explicit Activating(my_context ctx);
2681 void exit();
2682 };
2683
2684 struct Stray : boost::statechart::state< Stray, Started >,
2685 NamedState {
2686 explicit Stray(my_context ctx);
2687 void exit();
2688
2689 typedef boost::mpl::list <
2690 boost::statechart::custom_reaction< MQuery >,
2691 boost::statechart::custom_reaction< MLogRec >,
2692 boost::statechart::custom_reaction< MInfoRec >,
2693 boost::statechart::custom_reaction< ActMap >,
2694 boost::statechart::custom_reaction< RecoveryDone >,
2695 boost::statechart::transition<DeleteStart, ToDelete>
2696 > reactions;
2697 boost::statechart::result react(const MQuery& query);
2698 boost::statechart::result react(const MLogRec& logevt);
2699 boost::statechart::result react(const MInfoRec& infoevt);
2700 boost::statechart::result react(const ActMap&);
2701 boost::statechart::result react(const RecoveryDone&) {
2702 return discard_event();
2703 }
2704 };
2705
2706 struct WaitDeleteReserved;
2707 struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
2708 unsigned priority = 0;
2709 typedef boost::mpl::list <
2710 boost::statechart::custom_reaction< ActMap >,
2711 boost::statechart::custom_reaction< DeleteSome >
2712 > reactions;
2713 explicit ToDelete(my_context ctx);
2714 boost::statechart::result react(const ActMap &evt);
2715 boost::statechart::result react(const DeleteSome &evt) {
2716 // happens if we drop out of Deleting due to reprioritization etc.
2717 return discard_event();
2718 }
2719 void exit();
2720 };
2721
2722 struct Deleting;
2723 struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
2724 ToDelete>, NamedState {
2725 typedef boost::mpl::list <
2726 boost::statechart::transition<DeleteReserved, Deleting>
2727 > reactions;
2728 explicit WaitDeleteReserved(my_context ctx);
2729 void exit();
2730 };
2731
2732 struct Deleting : boost::statechart::state<Deleting,
2733 ToDelete>, NamedState {
2734 typedef boost::mpl::list <
2735 boost::statechart::custom_reaction< DeleteSome >,
2736 boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
2737 > reactions;
2738 explicit Deleting(my_context ctx);
2739 boost::statechart::result react(const DeleteSome &evt);
2740 void exit();
2741 };
2742
2743 struct GetLog;
2744
2745 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2746 set<pg_shard_t> peer_info_requested;
2747
2748 explicit GetInfo(my_context ctx);
2749 void exit();
2750 void get_infos();
2751
2752 typedef boost::mpl::list <
2753 boost::statechart::custom_reaction< QueryState >,
2754 boost::statechart::transition< GotInfo, GetLog >,
2755 boost::statechart::custom_reaction< MNotifyRec >,
2756 boost::statechart::transition< IsDown, Down >
2757 > reactions;
2758 boost::statechart::result react(const QueryState& q);
2759 boost::statechart::result react(const MNotifyRec& infoevt);
2760 };
2761
2762 struct GotLog : boost::statechart::event< GotLog > {
2763 GotLog() : boost::statechart::event< GotLog >() {}
2764 };
2765
2766 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2767 pg_shard_t auth_log_shard;
2768 boost::intrusive_ptr<MOSDPGLog> msg;
2769
2770 explicit GetLog(my_context ctx);
2771 void exit();
2772
2773 typedef boost::mpl::list <
2774 boost::statechart::custom_reaction< QueryState >,
2775 boost::statechart::custom_reaction< MLogRec >,
2776 boost::statechart::custom_reaction< GotLog >,
2777 boost::statechart::custom_reaction< AdvMap >,
2778 boost::statechart::transition< IsIncomplete, Incomplete >
2779 > reactions;
2780 boost::statechart::result react(const AdvMap&);
2781 boost::statechart::result react(const QueryState& q);
2782 boost::statechart::result react(const MLogRec& logevt);
2783 boost::statechart::result react(const GotLog&);
2784 };
2785
2786 struct WaitUpThru;
2787
2788 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2789 set<pg_shard_t> peer_missing_requested;
2790
2791 explicit GetMissing(my_context ctx);
2792 void exit();
2793
2794 typedef boost::mpl::list <
2795 boost::statechart::custom_reaction< QueryState >,
2796 boost::statechart::custom_reaction< MLogRec >,
2797 boost::statechart::transition< NeedUpThru, WaitUpThru >
2798 > reactions;
2799 boost::statechart::result react(const QueryState& q);
2800 boost::statechart::result react(const MLogRec& logevt);
2801 };
2802
2803 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2804 explicit WaitUpThru(my_context ctx);
2805 void exit();
2806
2807 typedef boost::mpl::list <
2808 boost::statechart::custom_reaction< QueryState >,
2809 boost::statechart::custom_reaction< ActMap >,
2810 boost::statechart::custom_reaction< MLogRec >
2811 > reactions;
2812 boost::statechart::result react(const QueryState& q);
2813 boost::statechart::result react(const ActMap& am);
2814 boost::statechart::result react(const MLogRec& logrec);
2815 };
2816
2817 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2818 explicit Down(my_context ctx);
2819 typedef boost::mpl::list <
2820 boost::statechart::custom_reaction< QueryState >,
2821 boost::statechart::custom_reaction< MNotifyRec >
2822 > reactions;
2823 boost::statechart::result react(const QueryState& q);
2824 boost::statechart::result react(const MNotifyRec& infoevt);
2825 void exit();
2826 };
2827
2828 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2829 typedef boost::mpl::list <
2830 boost::statechart::custom_reaction< AdvMap >,
2831 boost::statechart::custom_reaction< MNotifyRec >,
2832 boost::statechart::custom_reaction< QueryState >
2833 > reactions;
2834 explicit Incomplete(my_context ctx);
2835 boost::statechart::result react(const AdvMap &advmap);
2836 boost::statechart::result react(const MNotifyRec& infoevt);
2837 boost::statechart::result react(const QueryState& q);
2838 void exit();
2839 };
2840
2841 RecoveryMachine machine;
2842 PG *pg;
2843
2844 /// context passed in by state machine caller
2845 RecoveryCtx *orig_ctx;
2846
2847 /// populated if we are buffering messages pending a flush
2848 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2849
2850 /**
2851 * populated between start_handle() and end_handle(), points into
2852 * the message lists for messages_pending_flush while blocking messages
2853 * or into orig_ctx otherwise
2854 */
2855 boost::optional<RecoveryCtx> rctx;
2856
2857 public:
2858 explicit RecoveryState(PG *pg)
2859 : machine(this, pg), pg(pg), orig_ctx(0) {
2860 machine.initiate();
2861 }
2862
2863 void handle_event(const boost::statechart::event_base &evt,
2864 RecoveryCtx *rctx) {
2865 start_handle(rctx);
2866 machine.process_event(evt);
2867 end_handle();
2868 }
2869
2870 void handle_event(PGPeeringEventRef evt,
2871 RecoveryCtx *rctx) {
2872 start_handle(rctx);
2873 machine.process_event(evt->get_event());
2874 end_handle();
2875 }
2876
2877 } recovery_state;
2878
2879
2880
2881 uint64_t peer_features;
2882 uint64_t acting_features;
2883 uint64_t upacting_features;
2884
2885 epoch_t last_epoch;
2886
2887 /// most recently consumed osdmap's require_osd_release
2888 unsigned last_require_osd_release = 0;
2889 bool delete_needs_sleep = false;
2890
2891 protected:
2892 void reset_min_peer_features() {
2893 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2894 }
2895 uint64_t get_min_peer_features() const { return peer_features; }
2896 void apply_peer_features(uint64_t f) { peer_features &= f; }
2897
2898 uint64_t get_min_acting_features() const { return acting_features; }
2899 uint64_t get_min_upacting_features() const { return upacting_features; }
2900 bool perform_deletes_during_peering() const {
2901 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2902 }
2903
2904 bool hard_limit_pglog() const {
2905 return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
2906 }
2907
2908 void init_primary_up_acting(
2909 const vector<int> &newup,
2910 const vector<int> &newacting,
2911 int new_up_primary,
2912 int new_acting_primary) {
2913 actingset.clear();
2914 acting = newacting;
2915 for (uint8_t i = 0; i < acting.size(); ++i) {
2916 if (acting[i] != CRUSH_ITEM_NONE)
2917 actingset.insert(
2918 pg_shard_t(
2919 acting[i],
2920 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2921 }
2922 upset.clear();
2923 up = newup;
2924 for (uint8_t i = 0; i < up.size(); ++i) {
2925 if (up[i] != CRUSH_ITEM_NONE)
2926 upset.insert(
2927 pg_shard_t(
2928 up[i],
2929 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2930 }
2931 if (!pool.info.is_erasure()) {
2932 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2933 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2934 return;
2935 }
2936 up_primary = pg_shard_t();
2937 primary = pg_shard_t();
2938 for (uint8_t i = 0; i < up.size(); ++i) {
2939 if (up[i] == new_up_primary) {
2940 up_primary = pg_shard_t(up[i], shard_id_t(i));
2941 break;
2942 }
2943 }
2944 for (uint8_t i = 0; i < acting.size(); ++i) {
2945 if (acting[i] == new_acting_primary) {
2946 primary = pg_shard_t(acting[i], shard_id_t(i));
2947 break;
2948 }
2949 }
2950 ceph_assert(up_primary.osd == new_up_primary);
2951 ceph_assert(primary.osd == new_acting_primary);
2952 }
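// For an erasure-coded pool the position in up/acting doubles as the shard
// id: e.g. acting = {3, CRUSH_ITEM_NONE, 7} yields
// actingset = { pg_shard_t(3, shard_id_t(0)), pg_shard_t(7, shard_id_t(2)) },
// while a replicated pool uses shard_id_t::NO_SHARD throughout.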
2953
2954 void set_role(int r) {
2955 role = r;
2956 }
2957
2958 bool state_test(uint64_t m) const { return (state & m) != 0; }
2959 void state_set(uint64_t m) { state |= m; }
2960 void state_clear(uint64_t m) { state &= ~m; }
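// Usage sketch with the PG_STATE_* bits from osd_types.h:
//   state_set(PG_STATE_ACTIVE);
//   if (state_test(PG_STATE_ACTIVE | PG_STATE_PEERED)) { /* ... */ }
//   state_clear(PG_STATE_ACTIVATING);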
2961
2962 bool is_complete() const { return info.last_complete == info.last_update; }
2963 bool should_send_notify() const { return send_notify; }
2964
2965 uint64_t get_state() const { return state; }
2966 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2967 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2968 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2969 bool is_down() const { return state_test(PG_STATE_DOWN); }
2970 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2971 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2972 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2973 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2974 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2975 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2976 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2977 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2978 bool is_peered() const {
2979 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2980 }
2981 bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
2982 bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
2983 bool is_repair() const { return state_test(PG_STATE_REPAIR); }
2984
2985 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2986
2987 // pg on-disk state
2988 void do_pending_flush();
2989
2990 public:
2991 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2992 static void _init(ObjectStore::Transaction& t,
2993 spg_t pgid, const pg_pool_t *pool);
2994
2995 protected:
2996 void prepare_write_info(map<string,bufferlist> *km);
2997
2998 void update_store_with_options();
2999
3000 public:
3001 static int _prepare_write_info(
3002 CephContext* cct,
3003 map<string,bufferlist> *km,
3004 epoch_t epoch,
3005 pg_info_t &info,
3006 pg_info_t &last_written_info,
3007 PastIntervals &past_intervals,
3008 bool dirty_big_info,
3009 bool dirty_epoch,
3010 bool try_fast_info,
3011 PerfCounters *logger = nullptr);
3012
3013 void write_if_dirty(RecoveryCtx *rctx) {
3014 write_if_dirty(*rctx->transaction);
3015 }
3016 protected:
3017 void write_if_dirty(ObjectStore::Transaction& t);
3018
3019 PGLog::IndexedLog projected_log;
3020 bool check_in_progress_op(
3021 const osd_reqid_t &r,
3022 eversion_t *version,
3023 version_t *user_version,
3024 int *return_code) const;
3025 eversion_t projected_last_update;
3026 eversion_t get_next_version() const {
3027 eversion_t at_version(
3028 get_osdmap_epoch(),
3029 projected_last_update.version+1);
3030 ceph_assert(at_version > info.last_update);
3031 ceph_assert(at_version > pg_log.get_head());
3032 ceph_assert(at_version > projected_last_update);
3033 return at_version;
3034 }
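// e.g. with osdmap epoch 120 and projected_last_update == 118'42 the next
// version handed out is 120'43 (epoch'version), which the asserts above
// require to be strictly newer than the log head and the last projected
// update.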
3035
3036 void add_log_entry(const pg_log_entry_t& e, bool applied);
3037 void append_log(
3038 const vector<pg_log_entry_t>& logv,
3039 eversion_t trim_to,
3040 eversion_t roll_forward_to,
3041 ObjectStore::Transaction &t,
3042 bool transaction_applied = true,
3043 bool async = false);
3044 bool check_log_for_corruption(ObjectStore *store);
3045
3046 std::string get_corrupt_pg_log_name() const;
3047
3048 void update_snap_map(
3049 const vector<pg_log_entry_t> &log_entries,
3050 ObjectStore::Transaction& t);
3051
3052 void filter_snapc(vector<snapid_t> &snaps);
3053
3054 void log_weirdness();
3055
3056 virtual void kick_snap_trim() = 0;
3057 virtual void snap_trimmer_scrub_complete() = 0;
3058 bool requeue_scrub(bool high_priority = false);
3059 void queue_recovery();
3060 bool queue_scrub();
3061 unsigned get_scrub_priority();
3062
3063 /// share pg info after a pg is active
3064 void share_pg_info();
3065
3066
3067 bool append_log_entries_update_missing(
3068 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3069 ObjectStore::Transaction &t,
3070 boost::optional<eversion_t> trim_to,
3071 boost::optional<eversion_t> roll_forward_to);
3072
3073 /**
3074 * Merge entries, updating the missing sets as necessary on all
3075 * acting_recovery_backfill logs and missing sets (including missing_loc)
3076 */
3077 void merge_new_log_entries(
3078 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3079 ObjectStore::Transaction &t,
3080 boost::optional<eversion_t> trim_to,
3081 boost::optional<eversion_t> roll_forward_to);
3082
3083 void reset_interval_flush();
3084 void start_peering_interval(
3085 const OSDMapRef lastmap,
3086 const vector<int>& newup, int up_primary,
3087 const vector<int>& newacting, int acting_primary,
3088 ObjectStore::Transaction *t);
3089 void on_new_interval();
3090 virtual void _on_new_interval() = 0;
3091 void start_flush(ObjectStore::Transaction *t);
3092 void set_last_peering_reset();
3093
3094 void update_history(const pg_history_t& history);
3095 void fulfill_info(pg_shard_t from, const pg_query_t &query,
3096 pair<pg_shard_t, pg_info_t> &notify_info);
3097 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
3098 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
3099 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
3100
3101 bool should_restart_peering(
3102 int newupprimary,
3103 int newactingprimary,
3104 const vector<int>& newup,
3105 const vector<int>& newacting,
3106 OSDMapRef lastmap,
3107 OSDMapRef osdmap);
3108
3109 // OpRequest queueing
3110 bool can_discard_op(OpRequestRef& op);
3111 bool can_discard_scan(OpRequestRef op);
3112 bool can_discard_backfill(OpRequestRef op);
3113 bool can_discard_request(OpRequestRef& op);
3114
3115 template<typename T, int MSGTYPE>
3116 bool can_discard_replica_op(OpRequestRef& op);
3117
3118 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
3119 bool old_peering_evt(PGPeeringEventRef evt) {
3120 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
3121 }
3122 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
3123 return e <= cur_epoch;
3124 }
3125 bool have_same_or_newer_map(epoch_t e) {
3126 return e <= get_osdmap_epoch();
3127 }
3128
3129 bool op_has_sufficient_caps(OpRequestRef& op);
3130
3131
3132 // recovery bits
3133 void take_waiters();
3134
3135
3136 // abstract bits
3137 friend class FlushState;
3138
3139 virtual void on_role_change() = 0;
3140 virtual void on_pool_change() = 0;
3141 virtual void on_change(ObjectStore::Transaction *t) = 0;
3142 virtual void on_activate() = 0;
3143 virtual void on_flushed() = 0;
3144 virtual void check_blacklisted_watchers() = 0;
3145
3146 friend ostream& operator<<(ostream& out, const PG& pg);
3147 };
3148
3149
3150 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
3151
3152 #endif