// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/container/flat_set.hpp>

#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/ceph_assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"
#include "PGPeeringEvent.h"

#include "mgr/OSDPerfMetricTypes.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>

//#define DEBUG_RECOVERY_OIDS  // track set of recovering oids explicitly, to find counting bugs
//#define PG_DEBUG_REFS        // track provenance of pg refs, helpful for finding leaks

class OSD;
class OSDService;
class OSDShard;
class OSDShardPGSlot;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;
class DynamicPerfStats;

namespace Scrub {
  class Store;
}

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
      this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

  string get_current_state() {
    if (pi == nullptr) return "unknown";
    return std::get<1>(pi->embedded_states.top());
  }

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;

};
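
// Usage sketch (illustrative only; in practice the NamedState helper further
// below drives these calls): nested states are pushed/popped on the current
// PGStateInstance, and the final exit archives it into the ring buffer.
//
//   PGStateHistory hist;
//   hist.enter(pg, ceph_clock_now(), "Started");          // outermost state
//   hist.enter(pg, ceph_clock_now(), "Started/Primary");  // nested state
//   hist.exit("Started/Primary");  // pops the embedded state into history
//   hist.exit("Started");          // outermost exit completes the instance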

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;  // enter/exit counts
    uint64_t events;
    utime_t event_time;    // time spent processing events
    utime_t total_time;    // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

 public:
  PGRecoveryStats() : lock("PGRecoverStats::lock") {}

  void reset() {
    std::lock_guard l(lock);
    info.clear();
  }
  void dump(ostream& out) {
    std::lock_guard l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    std::lock_guard l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    std::lock_guard l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    std::lock_guard l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};
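
// Accumulation sketch (illustrative): the recovery state machine logs one
// enter per state entry and one exit with the measured durations; the key
// is the slash-separated nested state name.
//
//   PGRecoveryStats stats;
//   stats.log_enter("Started/Primary/Active");
//   // ... state runs, processing 3 events ...
//   stats.log_exit("Started/Primary/Active", dur, 3, event_dur);
//   stats.dump_formatted(f);  // nested_states: Started, Primary, Active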

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;

  pg_pool_t info;
  SnapContext snapc;  // the default pool snapc, ready to go.

  // these two sets are for < mimic only
  interval_set<snapid_t> cached_removed_snaps;  // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;   // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
         const string& name)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(name),
      info(info) {
    snapc = info.get_snap_context();
    if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
      info.build_removed_snaps(cached_removed_snaps);
    }
  }

  void update(CephContext *cct, OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
public:
  // -- members --
  const spg_t pg_id;
  const coll_t coll;

  ObjectStore::CollectionHandle ch;

  struct RecoveryCtx;

  // -- methods --
  std::ostream& gen_prefix(std::ostream& out) const override;
  CephContext *get_cct() const override {
    return cct;
  }
  unsigned get_subsys() const override {
    return ceph_subsys_osd;
  }

  const OSDMapRef& get_osdmap() const {
    ceph_assert(is_locked());
    ceph_assert(osdmap_ref);
    return osdmap_ref;
  }
  epoch_t get_osdmap_epoch() const {
    return osdmap_ref->get_epoch();
  }

  void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
    handle.suspend_tp_timeout();
    lock();
    handle.reset_tp_timeout();
  }
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    ceph_assert(!dirty_info);
    ceph_assert(!dirty_big_info);
    _lock.Unlock();
  }
  bool is_locked() const {
    return _lock.is_locked();
  }

  const spg_t& get_pgid() const {
    return pg_id;
  }

  const PGPool& get_pool() const {
    return pool;
  }
  uint64_t get_last_user_version() const {
    return info.last_user_version;
  }
  const pg_history_t& get_history() const {
    return info.history;
  }
  bool get_need_up_thru() const {
    return need_up_thru;
  }
  epoch_t get_same_interval_since() const {
    return info.history.same_interval_since;
  }

  void set_last_scrub_stamp(utime_t t) {
    info.stats.last_scrub_stamp = t;
    info.history.last_scrub_stamp = t;
  }

  void set_last_deep_scrub_stamp(utime_t t) {
    info.stats.last_deep_scrub_stamp = t;
    info.history.last_deep_scrub_stamp = t;
  }

  bool is_deleting() const {
    return deleting;
  }
  bool is_deleted() const {
    return deleted;
  }
  bool is_replica() const {
    return role > 0;
  }
  bool is_primary() const {
    return pg_whoami == primary;
  }
  bool pg_has_reset_since(epoch_t e) {
    ceph_assert(is_locked());
    return deleted || e < get_last_peering_reset();
  }

  bool is_ec_pg() const {
    return pool.info.is_erasure();
  }
  int get_role() const {
    return role;
  }
  const vector<int> get_acting() const {
    return acting;
  }
  int get_acting_primary() const {
    return primary.osd;
  }
  pg_shard_t get_primary() const {
    return primary;
  }
  const vector<int> get_up() const {
    return up;
  }
  int get_up_primary() const {
    return up_primary.osd;
  }
  const PastIntervals& get_past_intervals() const {
    return past_intervals;
  }

  /// initialize created PG
  void init(
    int role,
    const vector<int>& up,
    int up_primary,
    const vector<int>& acting,
    int acting_primary,
    const pg_history_t& history,
    const PastIntervals& pim,
    bool backfill,
    ObjectStore::Transaction *t);

  /// read existing pg state off disk
  void read_state(ObjectStore *store);
  static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);

  static int get_latest_struct_v() {
    return latest_struct_v;
  }
  static int get_compat_struct_v() {
    return compat_struct_v;
  }
  static int read_info(
    ObjectStore *store, spg_t pgid, const coll_t &coll,
    pg_info_t &info, PastIntervals &past_intervals,
    __u8 &);
  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);

  void rm_backoff(BackoffRef b);

  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
                  unsigned split_bits,
                  const pg_merge_meta_t& last_pg_merge_meta);
  void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
  void reg_next_scrub();
  void unreg_next_scrub();

  bool is_forced_recovery_or_backfill() const {
    return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
  }
  bool set_force_recovery(bool b);
  bool set_force_backfill(bool b);

  void queue_peering_event(PGPeeringEventRef evt);
  void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
  void queue_flushed(epoch_t started_at);
  void handle_advance_map(
    OSDMapRef osdmap, OSDMapRef lastmap,
    vector<int>& newup, int up_primary,
    vector<int>& newacting, int acting_primary,
    RecoveryCtx *rctx);
  void handle_activate_map(RecoveryCtx *rctx);
  void handle_initialize(RecoveryCtx *rctx);
  void handle_query_state(Formatter *f);

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  // more work after the above, but with a RecoveryCtx
  void find_unfound(epoch_t queued, RecoveryCtx *rctx);

  virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;

  void dump_pgstate_history(Formatter *f);
  void dump_missing(Formatter *f);

  void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
  void with_heartbeat_peers(std::function<void(int)> f);

  void shutdown();
  virtual void on_shutdown() = 0;

  bool get_must_scrub() const {
    return scrubber.must_scrub;
  }
  bool sched_scrub();

  virtual void do_request(
    OpRequestRef& op,
    ThreadPool::TPHandle &handle
  ) = 0;
  virtual void clear_cache() = 0;
  virtual int get_cache_obj_count() = 0;

  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
  virtual int do_command(
    cmdmap_t cmdmap,
    ostream& ss,
    bufferlist& idata,
    bufferlist& odata,
    ConnectionRef conn,
    ceph_tid_t tid) = 0;

  virtual bool agent_work(int max) = 0;
  virtual bool agent_work(int max, int agent_flush_quota) = 0;
  virtual void agent_stop() = 0;
  virtual void agent_delay() = 0;
  virtual void agent_clear() = 0;
  virtual void agent_choose_mode_restart() = 0;

  virtual void on_removal(ObjectStore::Transaction *t) = 0;

  void _delete_some(ObjectStore::Transaction *t);

  virtual void set_dynamic_perf_stats_queries(
    const std::list<OSDPerfMetricQuery> &queries) {
  }
  virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
  }

  // reference counting
#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);
  int get_num_ref() {
    return ref;
  }

  // ctor
  PG(OSDService *o, OSDMapRef curmap,
     const PGPool &pool, spg_t p);
  ~PG() override;

  // prevent copying
  explicit PG(const PG& rhs) = delete;
  PG& operator=(const PG& rhs) = delete;

protected:
  // -------------
  // protected
  OSDService *osd;
public:
  OSDShard *osd_shard = nullptr;
  OSDShardPGSlot *pg_slot = nullptr;
protected:
  CephContext *cct;

  // osdmap
  OSDMapRef osdmap_ref;

  PGPool pool;

  // locking and reference counting.
  // I destroy myself when the reference count hits zero.
  // lock() should be called before doing anything.
  // get() should be called on pointer copy (to another thread, etc.).
  // put() should be called on destruction of some previously copied pointer.
  // unlock() when done with the current pointer (_most common_).
  mutable Mutex _lock = {"PG::_lock"};

  std::atomic<unsigned int> ref{0};

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock = {"PG::_ref_id_lock"};
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id = 0;

  friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
  friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif

private:
  friend void intrusive_ptr_add_ref(PG *pg) {
    pg->get("intptr");
  }
  friend void intrusive_ptr_release(PG *pg) {
    pg->put("intptr");
  }

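  // Lifecycle sketch (illustrative): how the lock/ref discipline described
  // above is typically exercised when handing a PG to another thread.
  //
  //   PGRef pg = ...;   // intrusive_ptr copy calls get("intptr")
  //   pg->lock();       // take PG::_lock before touching PG state
  //   // ... read or mutate; persist dirty_info before unlocking ...
  //   pg->unlock();     // asserts !dirty_info && !dirty_big_info
  //   pg.reset();       // put("intptr"); PG frees itself when ref hits 0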

  // =====================

protected:
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
  virtual const PGBackend* get_pgbackend() const = 0;

protected:
  /*** PG ****/
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  epoch_t last_persisted_osdmap;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    ceph_assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

protected:

  bool deleting;  // true while the PG is being removed or the OSD is shutting down
  atomic<bool> deleted = {false};

  ZTracer::Endpoint trace_endpoint;

protected:
  bool dirty_info, dirty_big_info;

protected:
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v = 0;
  static const __u8 latest_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //    (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 10;
  void upgrade(ObjectStore *store);

protected:
  PGLog pg_log;
  ghobject_t pgmeta_oid;

  // ------------------
  // MissingLoc

  class MissingLoc {
  public:
    // a loc_count indicates how many locations we know in each of
    // these distinct sets
    struct loc_count_t {
      int up = 0;     //< up
      int other = 0;  //< other

      friend bool operator<(const loc_count_t& l,
                            const loc_count_t& r) {
        return (l.up < r.up ||
                (l.up == r.up &&
                 (l.other < r.other)));
      }
      friend ostream& operator<<(ostream& out, const loc_count_t& l) {
        ceph_assert(l.up >= 0);
        ceph_assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
      }
    };

  private:
    loc_count_t _get_count(const set<pg_shard_t>& shards) {
      loc_count_t r;
      for (auto s : shards) {
        if (pg->upset.count(s)) {
          r.up++;
        } else {
          r.other++;
        }
      }
      return r;
    }

    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;

    // for every entry in missing_loc, we count how many of each type of shard we have,
    // and maintain totals here.  The sum of the values for this map will always equal
    // missing_loc.size().
    map<shard_id_t, map<loc_count_t,int>> missing_by_count;

    void pgs_by_shard_id(const set<pg_shard_t>& s, map<shard_id_t, set<pg_shard_t>>& pgsbs) {
      if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
        int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
        // For completely missing shards initialize with empty set<pg_shard_t>
        for (int i = 0; i < num_shards; ++i) {
          shard_id_t shard(i);
          pgsbs[shard];
        }
        for (auto pgs: s)
          pgsbs[pgs.shard].insert(pgs);
      } else {
        pgsbs[shard_id_t::NO_SHARD] = s;
      }
    }

    void _inc_count(const set<pg_shard_t>& s) {
      map<shard_id_t, set<pg_shard_t>> pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs)
        ++missing_by_count[shard.first][_get_count(shard.second)];
    }
    void _dec_count(const set<pg_shard_t>& s) {
      map<shard_id_t, set<pg_shard_t>> pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs) {
        auto p = missing_by_count[shard.first].find(_get_count(shard.second));
        ceph_assert(p != missing_by_count[shard.first].end());
        if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
        }
      }
    }
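
    // Worked example (illustrative): for a replicated pg with upset = {0,1}
    // and an object whose known locations are {0,2}, _get_count yields
    // (1+1) -- one copy on an up osd, one elsewhere -- so
    // missing_by_count[NO_SHARD][(1+1)] is incremented.  add_location(1)
    // then decrements the (1+1) bucket and increments (2+1), keeping the
    // per-bucket totals equal to missing_loc.size().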

    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) { }
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    std::ostream& gen_prefix(std::ostream& out) const {
      return pg->gen_prefix(out);
    }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      return i->second.is_delete();
    }
    bool is_unfound(const hobject_t &hoid) const {
      auto it = needs_recovery_map.find(hoid);
      if (it == needs_recovery_map.end()) {
        return false;
      }
      if (it->second.is_delete()) {
        return false;
      }
      auto mit = missing_loc.find(hoid);
      return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (i->second.is_delete())
          continue;
        auto mi = missing_loc.find(i->first);
        if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (i->second.is_delete())
          continue;
        auto mi = missing_loc.find(i->first);
        if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
      missing_by_count.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p == missing_loc.end()) {
        p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
      } else {
        _dec_count(p->second);
      }
      p->second.insert(location);
      _inc_count(p->second);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
        if (p->second.empty()) {
          missing_loc.erase(p);
        } else {
          _inc_count(p->second);
        }
      }
    }

    void clear_location(const hobject_t &hoid) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid
                                    << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
          ceph_assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
      needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      auto it = needs_recovery_map.find(hoid);
      ceph_assert(it != needs_recovery_map.end());
      it->second.need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      );  ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
      ThreadPool::TPHandle* handle     ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          ceph_assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
        return;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      ceph_assert(info.last_backfill.is_max());
      ceph_assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        if (i.first == self)
          continue;
        auto pinfoiter = pinfo.find(i.first);
        ceph_assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
      _inc_count(mliter->second);
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      auto it = missing_loc.find(hoid);
      return it == missing_loc.end() ? empty_set : it->second;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
    const map<shard_id_t, map<loc_count_t,int>> &get_missing_by_count() const {
      return missing_by_count;
    }
  } missing_loc;

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  multiset<hobject_t> recovering_oids;
#endif

protected:
  int role;        // 0 = primary, 1 = replica, -1 = none.
  uint64_t state;  // PG_STATE_*

  bool send_notify;  ///< true if we are non-primary and should notify the primary

protected:
  eversion_t last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk;  // last_complete that has committed.
  eversion_t last_update_applied;

  // entries <= last_rollback_info_trimmed_to_applied have been trimmed
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
protected:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  // acting_recovery_backfill contains shards that are acting,
  // async recovery targets, or backfill targets.
  set<pg_shard_t> acting_recovery_backfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by;  ///< osds we are blocked by (for pg stats)

protected:
  // [primary only] content recovery state
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

public:
  bool dne() { return info.dne(); }
  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      ceph_assert(query_map);
      ceph_assert(info_map);
      ceph_assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }

    void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
      ceph_assert(notify_list);
      (*notify_list)[to.osd].push_back(make_pair(info, pi));
    }
  };
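
  // Usage sketch (illustrative): peering work can be staged against a
  // temporary BufferedRecoveryMessages and spliced back into the real
  // context afterwards.
  //
  //   BufferedRecoveryMessages buf;
  //   RecoveryCtx tmp(buf, *rctx);  // shares transaction/handle, buffers maps
  //   // ... generate queries/infos/notifies into tmp ...
  //   rctx->accept_buffered_messages(buf);  // merge the buffered output back
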
protected:

  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };

protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;              // non-acting osds that have PG data.
  map<pg_shard_t, pg_info_t> peer_info;   // info from peers (stray or prior)
  map<pg_shard_t, int64_t> peer_bytes;    // Peer's num_bytes from peer_info
  set<pg_shard_t> peer_purged;            // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;     // logs i've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // i deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
                                       // which are unfound on the primary
  epoch_t last_peering_reset;

  epoch_t get_last_peering_reset() const {
    return last_peering_reset;
  }

  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();

  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

public:
  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version; /// version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      ceph_assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
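
  // Worked example (illustrative): with objects = {a:v1, b:v2, c:v3} and
  // begin == a, trim_to(b) pops a and b, leaving objects = {c:v3} and
  // begin == c; once objects drains completely, begin collapses to end.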

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  set<pg_shard_t> backfill_targets, async_recovery_targets;

  // The primary's num_bytes and local num_bytes for this pg, only valid
  // during backfill for non-primary shards.
  // Both of these are adjusted for EC to reflect the on-disk bytes
  std::atomic<int64_t> primary_num_bytes = 0;
  std::atomic<int64_t> local_num_bytes = 0;

public:
  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

  // Space reserved for backfill is primary_num_bytes - local_num_bytes
  // Don't care that the difference itself isn't atomic
  uint64_t get_reserved_num_bytes() {
    int64_t primary = primary_num_bytes.load();
    int64_t local = local_num_bytes.load();
    if (primary > local)
      return primary - local;
    else
      return 0;
  }

  bool is_remote_backfilling() {
    return primary_num_bytes.load() > 0;
  }

  void set_reserved_num_bytes(int64_t primary, int64_t local);
  void clear_reserved_num_bytes();

  // If num_bytes are inconsistent and local_num_bytes goes negative
  // it's ok, because it would then be ignored.

  // The value of num_bytes could be negative,
  // but we don't let local_num_bytes go negative.
  void add_local_num_bytes(int64_t num_bytes) {
    if (num_bytes) {
      int64_t prev_bytes = local_num_bytes.load();
      int64_t new_bytes;
      do {
        new_bytes = prev_bytes + num_bytes;
        if (new_bytes < 0)
          new_bytes = 0;
      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
    }
  }
  void sub_local_num_bytes(int64_t num_bytes) {
    ceph_assert(num_bytes >= 0);
    if (num_bytes) {
      int64_t prev_bytes = local_num_bytes.load();
      int64_t new_bytes;
      do {
        new_bytes = prev_bytes - num_bytes;
        if (new_bytes < 0)
          new_bytes = 0;
      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
    }
  }
  // The value of num_bytes could be negative,
  // but we don't let info.stats.stats.sum.num_bytes go negative.
  void add_num_bytes(int64_t num_bytes) {
    ceph_assert(_lock.is_locked_by_me());
    if (num_bytes) {
      info.stats.stats.sum.num_bytes += num_bytes;
      if (info.stats.stats.sum.num_bytes < 0) {
        info.stats.stats.sum.num_bytes = 0;
      }
    }
  }
  void sub_num_bytes(int64_t num_bytes) {
    ceph_assert(_lock.is_locked_by_me());
    ceph_assert(num_bytes >= 0);
    if (num_bytes) {
      info.stats.stats.sum.num_bytes -= num_bytes;
      if (info.stats.stats.sum.num_bytes < 0) {
        info.stats.stats.sum.num_bytes = 0;
      }
    }
  }

  // Only used in testing so not worried about needing the PG lock here
  int64_t get_stats_num_bytes() {
    Mutex::Locker l(_lock);
    int64_t num_bytes = info.stats.stats.sum.num_bytes;
    if (pool.info.is_erasure()) {
      num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
      // Round up each object by a stripe
      num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
    }
    int64_t lnb = local_num_bytes.load();
    if (lnb && lnb != num_bytes) {
      lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
                            << lnb << " vs stats "
                            << info.stats.stats.sum.num_bytes << " / chunk "
                            << get_pgbackend()->get_ec_data_chunk_count()
                            << dendl;
    }
    return num_bytes;
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued.  Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order.  Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_flush
   *    - is_active() and flushes_in_progress
   *    - waiting for final flush during activate
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   *  1. During an interval change, we requeue *everything* in the above order.
   *
   *  2. When an obc rwlock is released, we check for a scrub block and requeue
   *     the op there if it applies.  We ignore the unreadable/degraded/blocked
   *     queues because we assume they cannot apply at that time (this is
   *     probably mostly true).
   *
   *  3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *     it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounter an unexpected error.  FIXME.
   */

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than our (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef> waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef> waiting_for_active;
  list<OpRequestRef> waiting_for_flush;
  list<OpRequestRef> waiting_for_scrub;

  list<OpRequestRef> waiting_for_cache_not_full;
  list<OpRequestRef> waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  void _update_calc_stats();
  void _update_blocked_by();
  friend class TestOpsSocketHook;
  void publish_stats_to_osd();
  void clear_publish_stats();

  void clear_primary_state();

  bool is_acting_recovery_backfill(pg_shard_t osd) const {
    return acting_recovery_backfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.is_erasure(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.is_erasure(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }

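  // Worked example (illustrative): for an EC pg with acting = [3,7,1],
  // has_shard(true, acting, pg_shard_t(7, shard_id_t(1))) is true because
  // slot 1 holds osd 7; for a replicated pg the shard id is ignored and
  // has_shard(false, ...) simply searches the vector for the osd id.
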
  bool needs_recovery() const;
  bool needs_backfill() const;

  /// clip calculated priority to reasonable range
  int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();
  /// get priority for pg deletion
  unsigned get_delete_priority();

  void try_mark_clean();  ///< mark an active pg clean

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = std::max(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
      info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = std::max(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
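
  // Worked example (illustrative): with last_epoch_clean = 500,
  // oldest_map = 520 and same_interval_since = 560, this returns
  // [max(500,520), 560) = [520, 560): intervals older than the oldest
  // retained map cannot be reconstructed, so they are not required.
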
  void check_past_interval_bounds() const;
  PastIntervals::PriorSet build_prior();

  void remove_down_peer_info(const OSDMapRef osdmap);

  bool adjust_need_up_thru(const OSDMapRef osdmap);

  bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
  virtual void dump_recovery_info(Formatter *f) const = 0;

  void calc_min_last_complete_ondisk() {
    eversion_t min = last_complete_ondisk;
    ceph_assert(!acting_recovery_backfill.empty());
    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
         i != acting_recovery_backfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      if (peer_last_complete_ondisk.count(*i) == 0)
        return;  // we don't have complete info
      eversion_t a = peer_last_complete_ondisk[*i];
      if (a < min)
        min = a;
    }
    if (min == min_last_complete_ondisk)
      return;
    min_last_complete_ondisk = min;
    return;
  }

  virtual void calc_trim_to() = 0;

  virtual void calc_trim_to_aggressive() = 0;

  void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
  void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
                       pg_missing_t& omissing, pg_shard_t from);
  bool proc_replica_info(
    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);

  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      ceph_assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };

  void update_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid,
    const set<snapid_t> &snaps);
  void clear_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid);
  void remove_snap_mapped_object(
    ObjectStore::Transaction& t, const hobject_t& soid);
  void merge_log(
    ObjectStore::Transaction& t, pg_info_t &oinfo,
    pg_log_t &olog, pg_shard_t from);
  void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
  bool search_for_missing(
    const pg_info_t &oinfo, const pg_missing_t &omissing,
    pg_shard_t fromosd,
    RecoveryCtx*);

  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);

  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const;
  static void calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    const vector<int> &up,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    ostream &ss);
  static void calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    uint64_t force_auth_primary_missing_objects,
    unsigned size,
    const vector<int> &acting,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    const OSDMapRef osdmap,
    ostream &ss);
  void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
                                const pg_info_t &auth_info,
                                vector<int> *want,
                                set<pg_shard_t> *async_recovery,
                                const OSDMapRef osdmap) const;
  void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
                                        const pg_info_t &auth_info,
                                        vector<int> *want,
                                        set<pg_shard_t> *async_recovery,
                                        const OSDMapRef osdmap) const;

  bool recoverable_and_ge_min_size(const vector<int> &want) const;
  bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
  void build_might_have_unfound();
  void activate(
    ObjectStore::Transaction& t,
    epoch_t activation_epoch,
    map<int, map<spg_t,pg_query_t> >& query_map,
    map<int,
        vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
    RecoveryCtx *ctx);

  struct C_PG_ActivateCommitted : public Context {
    PGRef pg;
    epoch_t epoch;
    epoch_t activation_epoch;
    C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
      : pg(p), epoch(e), activation_epoch(ae) {}
    void finish(int r) override {
      pg->_activate_committed(epoch, activation_epoch);
    }
  };
  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
  void all_activated_and_committed();

  void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);

  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }
  bool all_missing_unfound() const {
    const auto& missing = pg_log.get_missing();
    if (!missing.have_missing())
      return false;
    for (auto& m : missing.get_items()) {
      if (!missing_loc.is_unfound(m.first))
        return false;
    }
    return true;
  }

  virtual void check_local() = 0;

  void purge_strays();

  void update_heartbeat_peers();

  Context *finish_sync_event;

  Context *finish_recovery();
  void _finish_recovery(Context *c);
  struct C_PG_FinishRecovery : public Context {
    PGRef pg;
    explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
    void finish(int r) override {
      pg->_finish_recovery(this);
    }
  };
  void cancel_recovery();
  void clear_recovery_state();
  virtual void _clear_recovery_state() = 0;
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;
  friend class C_DeleteMore;

  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();

  void add_pg_backoff(SessionRef s) {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    add_backoff(s, begin, end);
  }
  void release_pg_backoffs() {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    release_backoffs(begin, end);
  }

  // -- scrub --
public:
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;
    bool reserved, reserve_failed;
    epoch_t epoch_start;

    // common to both scrubs
    bool active;
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int fixed;
    ScrubMap primary_scrubmap;
    ScrubMapBuilder primary_scrubmap_pos;
    epoch_t replica_scrub_start = 0;
    ScrubMap replica_scrubmap;
    ScrubMapBuilder replica_scrubmap_pos;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;
    utime_t scrub_reg_stamp;  // stamp we registered for

    omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority = 0;

    // this flag indicates whether we would like to do auto-repair of the PG or not
    bool auto_repair;
    // this flag indicates that we are scrubbing post repair to verify everything is fixed
    bool check_repair;
    // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
    // we should deep scrub in order to auto repair
    bool deep_scrub_on_error;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

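    // Moves the objects whose clone sets are fully represented from
    // cleaned_meta_map into for_meta_scrub.  Trailing objects that share a
    // head with the final entry (and lack its snapset) are held back, since
    // the rest of that clone set may still arrive with a later chunk.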
    void clean_meta_map(ScrubMap &for_meta_scrub) {
      if (end.is_max() ||
          cleaned_meta_map.objects.empty()) {
        cleaned_meta_map.swap(for_meta_scrub);
      } else {
        auto iter = cleaned_meta_map.objects.end();
        --iter; // not empty, see if clause
        auto begin = cleaned_meta_map.objects.begin();
        if (iter->first.has_snapset()) {
          ++iter;
        } else {
          while (iter != begin) {
            auto next = iter--;
            if (next->first.get_head() != iter->first.get_head()) {
              ++iter;
              break;
            }
          }
        }
        for_meta_scrub.objects.insert(begin, iter);
        cleaned_meta_map.objects.erase(begin, iter);
      }
    }

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;    // [start,end)
    hobject_t max_end;       // Largest end that may have been sent to replicas
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      BUILD_MAP_DONE,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
      BUILD_MAP_REPLICA,
    } state;
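
    // Rough per-chunk flow on the primary (driven by chunky_scrub()):
    //   NEW_CHUNK -> WAIT_PUSHES -> WAIT_LAST_UPDATE -> BUILD_MAP
    //     -> BUILD_MAP_DONE -> WAIT_REPLICAS -> COMPARE_MAPS
    //     -> WAIT_DIGEST_UPDATES -> NEW_CHUNK (next chunk) or FINISH.
    // BUILD_MAP_REPLICA is the map-building state on replicas.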

    std::unique_ptr<Scrub::Store> store;
    // deep scrub
    bool deep;
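    // Preemption budget for the current scrub: preempt_left is how many more
    // times the scrub may yield to client writes before it starts blocking
    // them; preempt_divisor scales down the chunk size after a preemption so
    // retried chunks finish faster (see chunky_scrub()).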
    int preempt_left;
    int preempt_divisor;

    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch (state)
      {
        case INACTIVE: ret = "INACTIVE"; break;
        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
        case BUILD_MAP: ret = "BUILD_MAP"; break;
        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
        case FINISH: ret = "FINISH"; break;
        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // clear all state
    void reset() {
      active = false;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;
      check_repair = false;
      deep_scrub_on_error = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      max_end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      fixed = 0;
      omap_stats = (const struct omap_stat_t){ 0 };
      deep = false;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      primary_scrubmap = ScrubMap();
      primary_scrubmap_pos.reset();
      replica_scrubmap = ScrubMap();
      replica_scrubmap_pos.reset();
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;

protected:
  bool scrub_after_recovery;

  int active_pushes;

  bool scrub_can_preempt = false;
  bool scrub_preempted = false;

  // We allow some number of preemptions of the scrub, during which we do
  // not block writes.  After that we start to block, and once we start
  // blocking we do not stop until the scrub range is completed.
  bool write_blocked_by_scrub(const hobject_t &soid);

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);

  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state(bool keep_repair = false);
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
  int build_scrub_map_chunk(
    ScrubMap &map,
    ScrubMapBuilder &pos,
    hobject_t start, hobject_t end, bool deep,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;

  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);

  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  void schedule_backfill_retry(float retry);
  void schedule_recovery_retry(float retry);

  // -- recovery state --

  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PGPeeringEventRef(
                                new PGPeeringEvent(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };
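
  // Illustrative sketch (assumed usage): QueuePeeringEvt wraps a peering
  // event in a Context that re-queues it under the PG lock when completed,
  // so it can be handed to a timer.  Retrying backfill later might look like:
  //
  //   Context *c = new QueuePeeringEvt<RequestBackfill>(
  //     this, get_osdmap_epoch(), RequestBackfill());
  //   // then arm c on a timer to fire after the retry interval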

  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };

public:
  int pg_stat_adjust(osd_stat_t *new_stat);
protected:

  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;
    OSDMapRef lastmap;
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };

  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
public:
  struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
    explicit UnfoundBackfill() {}
    void print(std::ostream *out) const {
      *out << "UnfoundBackfill";
    }
  };
  struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
    explicit UnfoundRecovery() {}
    void print(std::ostream *out) const {
      *out << "UnfoundRecovery";
    }
  };

  struct RequestScrub : boost::statechart::event<RequestScrub> {
    bool deep;
    bool repair;
    explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
    void print(std::ostream *out) const {
      *out << "RequestScrub(" << (deep ? "deep" : "shallow")
           << (repair ? " repair" : "") << ")";
    }
  };

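  // TrivialEvent(T) is a macro defined earlier in this header: it declares a
  // payload-free boost::statechart event struct T whose print() emits the
  // event's name.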
protected:
  TrivialEvent(Initialize)
  TrivialEvent(GotInfo)
  TrivialEvent(NeedUpThru)
  TrivialEvent(Backfilled)
  TrivialEvent(LocalBackfillReserved)
  TrivialEvent(RejectRemoteReservation)
public:
  TrivialEvent(RequestBackfill)
protected:
  TrivialEvent(RemoteRecoveryPreempted)
  TrivialEvent(RemoteBackfillPreempted)
  TrivialEvent(BackfillTooFull)
  TrivialEvent(RecoveryTooFull)

  TrivialEvent(MakePrimary)
  TrivialEvent(MakeStray)
  TrivialEvent(NeedActingChange)
  TrivialEvent(IsIncomplete)
  TrivialEvent(IsDown)

  TrivialEvent(AllReplicasRecovered)
  TrivialEvent(DoRecovery)
  TrivialEvent(LocalRecoveryReserved)
  TrivialEvent(AllRemotesReserved)
  TrivialEvent(AllBackfillsReserved)
  TrivialEvent(GoClean)

  TrivialEvent(AllReplicasActivated)

  TrivialEvent(IntervalFlush)

public:
  TrivialEvent(DeleteStart)
  TrivialEvent(DeleteSome)

  TrivialEvent(SetForceRecovery)
  TrivialEvent(UnsetForceRecovery)
  TrivialEvent(SetForceBackfill)
  TrivialEvent(UnsetForceBackfill)

protected:
  TrivialEvent(DeleteReserved)
  TrivialEvent(DeleteInterrupted)

  /* Encapsulates PG recovery process */
  class RecoveryState {
    void start_handle(RecoveryCtx *new_ctx);
    void end_handle();
  public:
    void begin_block_outgoing();
    void end_block_outgoing();
    void clear_blocked_outgoing();
  private:

    /* States */
    struct Initial;
    class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
      RecoveryState *state;
    public:
      PG *pg;

      utime_t event_time;
      uint64_t event_count;

      void clear_event_counters() {
        event_time = utime_t();
        event_count = 0;
      }

      void log_enter(const char *state_name);
      void log_exit(const char *state_name, utime_t duration);

      RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}

      /* Accessor functions for state methods */
      ObjectStore::Transaction* get_cur_transaction() {
        ceph_assert(state->rctx);
        ceph_assert(state->rctx->transaction);
        return state->rctx->transaction;
      }

      void send_query(pg_shard_t to, const pg_query_t &query) {
        ceph_assert(state->rctx);
        ceph_assert(state->rctx->query_map);
        (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
          query;
      }

      map<int, map<spg_t, pg_query_t> > *get_query_map() {
        ceph_assert(state->rctx);
        ceph_assert(state->rctx->query_map);
        return state->rctx->query_map;
      }

      map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
        ceph_assert(state->rctx);
        ceph_assert(state->rctx->info_map);
        return state->rctx->info_map;
      }

      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }

      void send_notify(pg_shard_t to,
                       const pg_notify_t &info, const PastIntervals &pi) {
        ceph_assert(state->rctx);
        state->rctx->send_notify(to, info, pi);
      }
    };
    friend class RecoveryMachine;

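    // Note: state reactions reach the accessors above through boost
    // statechart's context<>() mechanism, e.g.
    //
    //   context< RecoveryMachine >().send_query(peer, query);
    //
    // so outgoing queries/notifies/info accumulate in the active RecoveryCtx.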
    /* States */
    // Initial
    // Reset
    // Start
    //   Started
    //     Primary
    //       WaitActingChange
    //       Peering
    //         GetInfo
    //         GetLog
    //         GetMissing
    //         WaitUpThru
    //         Incomplete
    //         Down
    //       Active
    //         Activating
    //         Clean
    //         Recovered
    //         Backfilling
    //         WaitRemoteBackfillReserved
    //         WaitLocalBackfillReserved
    //         NotBackfilling
    //         NotRecovering
    //         Recovering
    //         WaitRemoteRecoveryReserved
    //         WaitLocalRecoveryReserved
    //     ReplicaActive
    //       RepNotRecovering
    //       RepRecovering
    //       RepWaitBackfillReserved
    //       RepWaitRecoveryReserved
    //     Stray
    //     ToDelete
    //       WaitDeleteReserved
    //       Deleting
    // Crashed

    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };

    struct Reset;

    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< Initialize, Reset >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;

      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Start;

    struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
      explicit Started(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< IntervalFlush >,
        // ignored
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction<SetForceRecovery>,
        boost::statechart::custom_reaction<UnsetForceRecovery>,
        boost::statechart::custom_reaction<SetForceBackfill>,
        boost::statechart::custom_reaction<UnsetForceBackfill>,
        boost::statechart::custom_reaction<RequestScrub>,
        // crash
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Primary;
    struct Stray;

    struct Start : boost::statechart::state< Start, Started >, NamedState {
      explicit Start(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< MakePrimary, Primary >,
        boost::statechart::transition< MakeStray, Stray >
        > reactions;
    };

    struct Peering;
    struct WaitActingChange;
    struct Incomplete;
    struct Down;

    struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
      explicit Primary(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< NeedActingChange, WaitActingChange >,
        boost::statechart::custom_reaction<SetForceRecovery>,
        boost::statechart::custom_reaction<UnsetForceRecovery>,
        boost::statechart::custom_reaction<SetForceBackfill>,
        boost::statechart::custom_reaction<UnsetForceBackfill>,
        boost::statechart::custom_reaction<RequestScrub>
        > reactions;
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const SetForceRecovery&);
      boost::statechart::result react(const UnsetForceRecovery&);
      boost::statechart::result react(const SetForceBackfill&);
      boost::statechart::result react(const UnsetForceBackfill&);
      boost::statechart::result react(const RequestScrub&);
    };

    struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
                              NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >
        > reactions;
      explicit WaitActingChange(my_context ctx);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MNotifyRec&);
      void exit();
    };

    struct GetInfo;
    struct Active;

    struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
      PastIntervals::PriorSet prior_set;
      bool history_les_bound;  ///< need osd_find_best_info_ignore_history_les

      explicit Peering(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::transition< Activate, Active >,
        boost::statechart::custom_reaction< AdvMap >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap &advmap);
    };

    struct WaitLocalRecoveryReserved;
    struct Activating;
    struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
      explicit Active(my_context ctx);
      void exit();

      const set<pg_shard_t> remote_shards_to_reserve_recovery;
      const set<pg_shard_t> remote_shards_to_reserve_backfill;
      bool all_replicas_activated;

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MTrim >,
        boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< AllReplicasActivated >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
        boost::statechart::custom_reaction< RemoteReservationRevoked>,
        boost::statechart::custom_reaction< DoRecovery>
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MNotifyRec& notevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MTrim& trimevt);
      boost::statechart::result react(const Backfilled&) {
        return discard_event();
      }
      boost::statechart::result react(const AllReplicasActivated&);
      boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
        return discard_event();
      }
      boost::statechart::result react(const RemoteReservationRevoked&) {
        return discard_event();
      }
      boost::statechart::result react(const DoRecovery&) {
        return discard_event();
      }
    };

    struct Clean : boost::statechart::state< Clean, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction<SetForceRecovery>,
        boost::statechart::custom_reaction<SetForceBackfill>
      > reactions;
      explicit Clean(my_context ctx);
      void exit();
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< GoClean, Clean >,
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< AllReplicasActivated >
      > reactions;
      explicit Recovered(my_context ctx);
      void exit();
      boost::statechart::result react(const AllReplicasActivated&) {
        post_event(GoClean());
        return forward_event();
      }
    };

    struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
        boost::statechart::custom_reaction< RemoteReservationRevoked>
      > reactions;
      explicit Backfilling(my_context ctx);
      boost::statechart::result react(const RemoteReservationRejected& evt) {
        // for compat with old peers
        post_event(RemoteReservationRevokedTooFull());
        return discard_event();
      }
      void backfill_release_reservations();
      boost::statechart::result react(const Backfilled& evt);
      boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
      boost::statechart::result react(const RemoteReservationRevoked& evt);
      boost::statechart::result react(const DeferBackfill& evt);
      boost::statechart::result react(const UnfoundBackfill& evt);
      void cancel_backfill();
      void exit();
    };

    struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationRevoked >,
        boost::statechart::transition< AllBackfillsReserved, Backfilling >
      > reactions;
      set<pg_shard_t>::const_iterator backfill_osd_it;
      explicit WaitRemoteBackfillReserved(my_context ctx);
      void retry();
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejected& evt);
      boost::statechart::result react(const RemoteReservationRevoked& evt);
    };

    struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
      > reactions;
      explicit WaitLocalBackfillReserved(my_context ctx);
      void exit();
    };

    struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >
      > reactions;
      explicit NotBackfilling(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejected& evt);
    };
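
    // Primary-side backfill reservation flow (per the states above):
    // NotBackfilling --RequestBackfill--> WaitLocalBackfillReserved
    //   --LocalBackfillReserved--> WaitRemoteBackfillReserved (reserving one
    //   backfill target at a time) --AllBackfillsReserved--> Backfilling.
    // A rejected or revoked remote reservation abandons the attempt and a
    // retry is scheduled (see schedule_backfill_retry()).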

    struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< UnfoundRecovery >
      > reactions;
      explicit NotRecovering(my_context ctx);
      boost::statechart::result react(const DeferRecovery& evt) {
        /* no-op */
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        /* no-op */
        return discard_event();
      }
      void exit();
    };

    struct ToDelete;
    struct RepNotRecovering;
    struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
      explicit ReplicaActive(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MTrim >,
        boost::statechart::custom_reaction< Activate >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< RemoteBackfillPreempted >,
        boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
        boost::statechart::custom_reaction< RecoveryDone >,
        boost::statechart::transition<DeleteStart, ToDelete>
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MTrim& trimevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MQuery&);
      boost::statechart::result react(const Activate&);
      boost::statechart::result react(const RecoveryDone&) {
        return discard_event();
      }
      boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const RemoteBackfillPreempted& evt) {
        return discard_event();
      }
      boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
        return discard_event();
      }
    };

    struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< RecoveryDone, RepNotRecovering >,
        // for compat with old peers
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::custom_reaction< BackfillTooFull >,
        boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
        boost::statechart::custom_reaction< RemoteBackfillPreempted >
      > reactions;
      explicit RepRecovering(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryPreempted &evt);
      boost::statechart::result react(const BackfillTooFull &evt);
      boost::statechart::result react(const RemoteBackfillPreempted &evt);
      void exit();
    };

    struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationCanceled >
      > reactions;
      explicit RepWaitBackfillReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved &evt);
      boost::statechart::result react(const RejectRemoteReservation &evt);
      boost::statechart::result react(const RemoteReservationRejected &evt);
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };

    struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        // for compat with old peers
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationCanceled >
      > reactions;
      explicit RepWaitRecoveryReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      boost::statechart::result react(const RemoteReservationRejected &evt) {
        // for compat with old peers
        post_event(RemoteReservationCanceled());
        return discard_event();
      }
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };

    struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RequestRecoveryPrio >,
        boost::statechart::custom_reaction< RequestBackfillPrio >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
      > reactions;
      explicit RepNotRecovering(my_context ctx);
      boost::statechart::result react(const RequestRecoveryPrio &evt);
      boost::statechart::result react(const RequestBackfillPrio &evt);
      boost::statechart::result react(const RemoteBackfillReserved &evt) {
        // my reservation completion raced with a RELEASE from primary
        return discard_event();
      }
      boost::statechart::result react(const RemoteRecoveryReserved &evt) {
        // my reservation completion raced with a RELEASE from primary
        return discard_event();
      }
      boost::statechart::result react(const RejectRemoteReservation &evt);
      void exit();
    };

    struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< AllReplicasRecovered >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< RequestBackfill >
        > reactions;
      explicit Recovering(my_context ctx);
      void exit();
      void release_reservations(bool cancel = false);
      boost::statechart::result react(const AllReplicasRecovered &evt);
      boost::statechart::result react(const DeferRecovery& evt);
      boost::statechart::result react(const UnfoundRecovery& evt);
      boost::statechart::result react(const RequestBackfill &evt);
    };

    struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        boost::statechart::transition< AllRemotesReserved, Recovering >
        > reactions;
      set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
      explicit WaitRemoteRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      void exit();
    };

    struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
        boost::statechart::custom_reaction< RecoveryTooFull >
        > reactions;
      explicit WaitLocalRecoveryReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RecoveryTooFull &evt);
    };

    struct Activating : boost::statechart::state< Activating, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition< AllReplicasRecovered, Recovered >,
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
        > reactions;
      explicit Activating(my_context ctx);
      void exit();
    };

    struct Stray : boost::statechart::state< Stray, Started >,
                   NamedState {
      explicit Stray(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< RecoveryDone >,
        boost::statechart::transition<DeleteStart, ToDelete>
        > reactions;
      boost::statechart::result react(const MQuery& query);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const RecoveryDone&) {
        return discard_event();
      }
    };

    struct WaitDeleteReserved;
    struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
      unsigned priority = 0;
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< DeleteSome >
        > reactions;
      explicit ToDelete(my_context ctx);
      boost::statechart::result react(const ActMap &evt);
      boost::statechart::result react(const DeleteSome &evt) {
        // happens if we drop out of Deleting due to reprioritization etc.
        return discard_event();
      }
      void exit();
    };

    struct Deleting;
    struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
                                                         ToDelete>, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition<DeleteReserved, Deleting>
        > reactions;
      explicit WaitDeleteReserved(my_context ctx);
      void exit();
    };

    struct Deleting : boost::statechart::state<Deleting,
                                               ToDelete>, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< DeleteSome >,
        boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
        > reactions;
      explicit Deleting(my_context ctx);
      boost::statechart::result react(const DeleteSome &evt);
      void exit();
    };
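
    // PG removal flow (per the states above): a replica or stray PG gets
    // DeleteStart and enters ToDelete; once the OSD grants a local delete
    // reservation (DeleteReserved) it moves to Deleting, which consumes
    // DeleteSome events until the PG's objects and metadata are removed.
    // DeleteInterrupted (e.g. on reprioritization) falls back to
    // WaitDeleteReserved.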

    struct GetLog;

    struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
      set<pg_shard_t> peer_info_requested;

      explicit GetInfo(my_context ctx);
      void exit();
      void get_infos();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::transition< GotInfo, GetLog >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< IsDown, Down >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };

    struct GotLog : boost::statechart::event< GotLog > {
      GotLog() : boost::statechart::event< GotLog >() {}
    };

    struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
      pg_shard_t auth_log_shard;
      boost::intrusive_ptr<MOSDPGLog> msg;

      explicit GetLog(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< GotLog >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::transition< IsIncomplete, Incomplete >
        > reactions;
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const GotLog&);
    };

    struct WaitUpThru;

    struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
      set<pg_shard_t> peer_missing_requested;

      explicit GetMissing(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::transition< NeedUpThru, WaitUpThru >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
    };

    struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
      explicit WaitUpThru(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MLogRec >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap& am);
      boost::statechart::result react(const MLogRec& logrec);
    };

    struct Down : boost::statechart::state< Down, Peering>, NamedState {
      explicit Down(my_context ctx);
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MNotifyRec >
        > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
      void exit();
    };

    struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< QueryState >
        > reactions;
      explicit Incomplete(my_context ctx);
      boost::statechart::result react(const AdvMap &advmap);
      boost::statechart::result react(const MNotifyRec& infoevt);
      boost::statechart::result react(const QueryState& q);
      void exit();
    };

    RecoveryMachine machine;
    PG *pg;

    /// context passed in by state machine caller
    RecoveryCtx *orig_ctx;

    /// populated if we are buffering messages pending a flush
    boost::optional<BufferedRecoveryMessages> messages_pending_flush;

    /**
     * populated between start_handle() and end_handle(), points into
     * the message lists for messages_pending_flush while blocking messages
     * or into orig_ctx otherwise
     */
    boost::optional<RecoveryCtx> rctx;

  public:
    explicit RecoveryState(PG *pg)
      : machine(this, pg), pg(pg), orig_ctx(0) {
      machine.initiate();
    }

    void handle_event(const boost::statechart::event_base &evt,
                      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt);
      end_handle();
    }

    void handle_event(PGPeeringEventRef evt,
                      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt->get_event());
      end_handle();
    }

  } recovery_state;
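
  // How events reach the machine: callers pass a RecoveryCtx to
  // handle_event(); start_handle() points rctx at it (or at
  // messages_pending_flush while outgoing messages are being buffered), the
  // statechart reactions then queue messages through the RecoveryMachine
  // accessors, and end_handle() detaches the context again.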

  uint64_t peer_features;
  uint64_t acting_features;
  uint64_t upacting_features;

  epoch_t last_epoch;

  /// most recently consumed osdmap's require_osd_release
  unsigned last_require_osd_release = 0;
  bool delete_needs_sleep = false;

protected:
  void reset_min_peer_features() {
    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  }
  uint64_t get_min_peer_features() const { return peer_features; }
  void apply_peer_features(uint64_t f) { peer_features &= f; }

  uint64_t get_min_acting_features() const { return acting_features; }
  uint64_t get_min_upacting_features() const { return upacting_features; }
  bool perform_deletes_during_peering() const {
    return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
  }

  bool hard_limit_pglog() const {
    return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
  }

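  // Rebuilds the acting/up shard sets from the raw OSD id vectors.  For
  // erasure-coded pools the position in the vector is the shard id, so an
  // acting vector [3,1,4] yields actingset {3(0), 1(1), 4(2)}; for
  // replicated pools every entry uses NO_SHARD.  CRUSH_ITEM_NONE holes are
  // skipped.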
  void init_primary_up_acting(
    const vector<int> &newup,
    const vector<int> &newacting,
    int new_up_primary,
    int new_acting_primary) {
    actingset.clear();
    acting = newacting;
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] != CRUSH_ITEM_NONE)
        actingset.insert(
          pg_shard_t(
            acting[i],
            pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    upset.clear();
    up = newup;
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] != CRUSH_ITEM_NONE)
        upset.insert(
          pg_shard_t(
            up[i],
            pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    if (!pool.info.is_erasure()) {
      up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
      primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
      return;
    }
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }

  void set_role(int r) {
    role = r;
  }

  bool state_test(uint64_t m) const { return (state & m) != 0; }
  void state_set(uint64_t m) { state |= m; }
  void state_clear(uint64_t m) { state &= ~m; }

  bool is_complete() const { return info.last_complete == info.last_update; }
  bool should_send_notify() const { return send_notify; }

  uint64_t get_state() const { return state; }
  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
  bool is_peering() const { return state_test(PG_STATE_PEERING); }
  bool is_down() const { return state_test(PG_STATE_DOWN); }
  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
  bool is_peered() const {
    return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
  }
  bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
  bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
  bool is_repair() const { return state_test(PG_STATE_REPAIR); }

  bool is_empty() const { return info.last_update == eversion_t(0,0); }

  // pg on-disk state
  void do_pending_flush();

public:
  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
  static void _init(ObjectStore::Transaction& t,
                    spg_t pgid, const pg_pool_t *pool);

protected:
  void prepare_write_info(map<string,bufferlist> *km);

  void update_store_with_options();

public:
  static int _prepare_write_info(
    CephContext* cct,
    map<string,bufferlist> *km,
    epoch_t epoch,
    pg_info_t &info,
    pg_info_t &last_written_info,
    PastIntervals &past_intervals,
    bool dirty_big_info,
    bool dirty_epoch,
    bool try_fast_info,
    PerfCounters *logger = nullptr);

  void write_if_dirty(RecoveryCtx *rctx) {
    write_if_dirty(*rctx->transaction);
  }
protected:
  void write_if_dirty(ObjectStore::Transaction& t);

  PGLog::IndexedLog projected_log;
  bool check_in_progress_op(
    const osd_reqid_t &r,
    eversion_t *version,
    version_t *user_version,
    int *return_code) const;
  eversion_t projected_last_update;
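  // Version stamped onto the next new log entry: current osdmap epoch, and
  // one past the last *projected* (not yet applied) update, so in-flight ops
  // on the primary receive strictly increasing versions.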
  eversion_t get_next_version() const {
    eversion_t at_version(
      get_osdmap_epoch(),
      projected_last_update.version+1);
    ceph_assert(at_version > info.last_update);
    ceph_assert(at_version > pg_log.get_head());
    ceph_assert(at_version > projected_last_update);
    return at_version;
  }

  void add_log_entry(const pg_log_entry_t& e, bool applied);
  void append_log(
    const vector<pg_log_entry_t>& logv,
    eversion_t trim_to,
    eversion_t roll_forward_to,
    ObjectStore::Transaction &t,
    bool transaction_applied = true,
    bool async = false);
  bool check_log_for_corruption(ObjectStore *store);

  std::string get_corrupt_pg_log_name() const;

  void update_snap_map(
    const vector<pg_log_entry_t> &log_entries,
    ObjectStore::Transaction& t);

  void filter_snapc(vector<snapid_t> &snaps);

  void log_weirdness();

  virtual void kick_snap_trim() = 0;
  virtual void snap_trimmer_scrub_complete() = 0;
  bool requeue_scrub(bool high_priority = false);
  void queue_recovery();
  bool queue_scrub();
  unsigned get_scrub_priority();

  /// share pg info after a pg is active
  void share_pg_info();


  bool append_log_entries_update_missing(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  /**
   * Merge entries, updating missing as necessary on all
   * acting_recovery_backfill logs and missing sets (also missing_loc)
   */
  void merge_new_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  void reset_interval_flush();
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t);
  void set_last_peering_reset();

  void update_history(const pg_history_t& history);
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
                    pair<pg_shard_t, pg_info_t> &notify_info);
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
  void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);

  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);

  // OpRequest queueing
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
  bool old_peering_evt(PGPeeringEventRef evt) {
    return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
  }
  static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
    return e <= cur_epoch;
  }
  bool have_same_or_newer_map(epoch_t e) {
    return e <= get_osdmap_epoch();
  }

  bool op_has_sufficient_caps(OpRequestRef& op);


  // recovery bits
  void take_waiters();


  // abstract bits
  friend class FlushState;

  virtual void on_role_change() = 0;
  virtual void on_pool_change() = 0;
  virtual void on_change(ObjectStore::Transaction *t) = 0;
  virtual void on_activate() = 0;
  virtual void on_flushed() = 0;
  virtual void check_blacklisted_watchers() = 0;

  friend ostream& operator<<(ostream& out, const PG& pg);
};


ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);

#endif