// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include <boost/container/flat_set.hpp>
#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/ceph_assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"
#include "PGPeeringEvent.h"

#include "mgr/OSDPerfMetricTypes.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>

//#define DEBUG_RECOVERY_OIDS  // track set of recovering oids explicitly, to find counting bugs
//#define PG_DEBUG_REFS        // track provenance of pg refs, helpful for finding leaks

class OSD;
class OSDService;
class OSDShard;
class OSDShardPGSlot;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;
class DynamicPerfStats;

namespace Scrub {
  class Store;
}

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
        this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};
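
// Illustrative sketch (not part of this header): enter_state()/exit_state()
// calls must balance, since exit_state() pops the most recent entry; a nested
// pair of states would be recorded as:
//
//   PGStateInstance psi;
//   psi.enter_state(t0, "Started");  // outer state pushed
//   psi.enter_state(t1, "Primary");  // nested state pushed
//   psi.exit_state(t2);              // pops "Primary", history gains [t1, t2)
//   psi.exit_state(t3);              // pops "Started", history gains [t0, t3)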

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

  string get_current_state() {
    if (pi == nullptr) return "unknown";
    return std::get<1>(pi->embedded_states.top());
  }

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;

};

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;     // enter/exit counts
    uint64_t events;
    utime_t event_time;       // time spent processing events
    utime_t total_time;       // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

 public:
  PGRecoveryStats() : lock("PGRecoverStats::lock") {}

  void reset() {
    std::lock_guard l(lock);
    info.clear();
  }
  void dump(ostream& out) {
    std::lock_guard l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    std::lock_guard l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    std::lock_guard l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    std::lock_guard l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};
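
// Illustrative sketch (not part of this header): each state visit is expected
// to pair one log_enter() with one log_exit() under the same state-name key,
// e.g. with a hypothetical stats object:
//
//   PGRecoveryStats stats;
//   stats.log_enter("Started/Primary");
//   ...                          // state runs, processing n events
//   stats.log_exit("Started/Primary", total_dur, n, event_dur);
//   stats.dump(std::cout);       // one tab-separated row per state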

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;

  pg_pool_t info;
  SnapContext snapc;   // the default pool snapc, ready to go.

  // these two sets are for < mimic only
  interval_set<snapid_t> cached_removed_snaps;  // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;   // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
         const string& name)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(name),
      info(info) {
    snapc = info.get_snap_context();
    if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
      info.build_removed_snaps(cached_removed_snaps);
    }
  }

  void update(CephContext *cct, OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
public:
  // -- members --
  const spg_t pg_id;
  const coll_t coll;

  ObjectStore::CollectionHandle ch;

  struct RecoveryCtx;

  // -- methods --
  std::ostream& gen_prefix(std::ostream& out) const override;
  CephContext *get_cct() const override {
    return cct;
  }
  unsigned get_subsys() const override {
    return ceph_subsys_osd;
  }

  const OSDMapRef& get_osdmap() const {
    ceph_assert(is_locked());
    ceph_assert(osdmap_ref);
    return osdmap_ref;
  }
  epoch_t get_osdmap_epoch() const {
    return osdmap_ref->get_epoch();
  }

  void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
    handle.suspend_tp_timeout();
    lock();
    handle.reset_tp_timeout();
  }
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    ceph_assert(!dirty_info);
    ceph_assert(!dirty_big_info);
    _lock.Unlock();
  }
  bool is_locked() const {
    return _lock.is_locked();
  }

  const spg_t& get_pgid() const {
    return pg_id;
  }

  const PGPool& get_pool() const {
    return pool;
  }
  uint64_t get_last_user_version() const {
    return info.last_user_version;
  }
  const pg_history_t& get_history() const {
    return info.history;
  }
  bool get_need_up_thru() const {
    return need_up_thru;
  }
  epoch_t get_same_interval_since() const {
    return info.history.same_interval_since;
  }

  void set_last_scrub_stamp(utime_t t) {
    info.stats.last_scrub_stamp = t;
    info.history.last_scrub_stamp = t;
  }

  void set_last_deep_scrub_stamp(utime_t t) {
    info.stats.last_deep_scrub_stamp = t;
    info.history.last_deep_scrub_stamp = t;
  }

  bool is_deleting() const {
    return deleting;
  }
  bool is_deleted() const {
    return deleted;
  }
  bool is_replica() const {
    return role > 0;
  }
  bool is_primary() const {
    return pg_whoami == primary;
  }
  bool pg_has_reset_since(epoch_t e) {
    ceph_assert(is_locked());
    return deleted || e < get_last_peering_reset();
  }

  bool is_ec_pg() const {
    return pool.info.is_erasure();
  }
  int get_role() const {
    return role;
  }
  const vector<int> get_acting() const {
    return acting;
  }
  int get_acting_primary() const {
    return primary.osd;
  }
  pg_shard_t get_primary() const {
    return primary;
  }
  const vector<int> get_up() const {
    return up;
  }
  int get_up_primary() const {
    return up_primary.osd;
  }
  const PastIntervals& get_past_intervals() const {
    return past_intervals;
  }

  /// initialize created PG
  void init(
    int role,
    const vector<int>& up,
    int up_primary,
    const vector<int>& acting,
    int acting_primary,
    const pg_history_t& history,
    const PastIntervals& pim,
    bool backfill,
    ObjectStore::Transaction *t);

  /// read existing pg state off disk
  void read_state(ObjectStore *store);
  static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);

  static int get_latest_struct_v() {
    return latest_struct_v;
  }
  static int get_compat_struct_v() {
    return compat_struct_v;
  }
  static int read_info(
    ObjectStore *store, spg_t pgid, const coll_t &coll,
    pg_info_t &info, PastIntervals &past_intervals,
    __u8 &);
  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);

  void rm_backoff(BackoffRef b);

  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
                  unsigned split_bits,
                  const pg_merge_meta_t& last_pg_merge_meta);
  void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);

  bool is_scrub_registered();
  void reg_next_scrub();
  void unreg_next_scrub();

  void on_info_history_change();

  void scrub_requested(bool deep, bool repair, bool need_auto = false);

  bool is_forced_recovery_or_backfill() const {
    return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
  }
  bool set_force_recovery(bool b);
  bool set_force_backfill(bool b);

  void queue_peering_event(PGPeeringEventRef evt);
  void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
  void queue_flushed(epoch_t started_at);
  void handle_advance_map(
    OSDMapRef osdmap, OSDMapRef lastmap,
    vector<int>& newup, int up_primary,
    vector<int>& newacting, int acting_primary,
    RecoveryCtx *rctx);
  void handle_activate_map(RecoveryCtx *rctx);
  void handle_initialize(RecoveryCtx *rctx);
  void handle_query_state(Formatter *f);

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  // more work after the above, but with a RecoveryCtx
  void find_unfound(epoch_t queued, RecoveryCtx *rctx);

  virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;

  void dump_pgstate_history(Formatter *f);
  void dump_missing(Formatter *f);

  void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
  void with_heartbeat_peers(std::function<void(int)> f);

  void shutdown();
  virtual void on_shutdown() = 0;

  bool get_must_scrub() const {
    return scrubber.must_scrub;
  }
  bool sched_scrub();

  virtual void do_request(
    OpRequestRef& op,
    ThreadPool::TPHandle &handle
  ) = 0;
  virtual void clear_cache() = 0;
  virtual int get_cache_obj_count() = 0;

  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
  virtual int do_command(
    cmdmap_t cmdmap,
    ostream& ss,
    bufferlist& idata,
    bufferlist& odata,
    ConnectionRef conn,
    ceph_tid_t tid) = 0;

  virtual bool agent_work(int max) = 0;
  virtual bool agent_work(int max, int agent_flush_quota) = 0;
  virtual void agent_stop() = 0;
  virtual void agent_delay() = 0;
  virtual void agent_clear() = 0;
  virtual void agent_choose_mode_restart() = 0;

  virtual void on_removal(ObjectStore::Transaction *t) = 0;

  void _delete_some(ObjectStore::Transaction *t);

  virtual void set_dynamic_perf_stats_queries(
    const std::list<OSDPerfMetricQuery> &queries) {
  }
  virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
  }

  // reference counting
#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);
  int get_num_ref() {
    return ref;
  }

  // ctor
  PG(OSDService *o, OSDMapRef curmap,
     const PGPool &pool, spg_t p);
  ~PG() override;

  // prevent copying
  explicit PG(const PG& rhs) = delete;
  PG& operator=(const PG& rhs) = delete;

protected:
  // -------------
  // protected
  OSDService *osd;
public:
  OSDShard *osd_shard = nullptr;
  OSDShardPGSlot *pg_slot = nullptr;
protected:
  CephContext *cct;

  // osdmap
  OSDMapRef osdmap_ref;

  PGPool pool;

  // locking and reference counting.
  // I destroy myself when the reference count hits zero.
  // lock() should be called before doing anything.
  // get() should be called on pointer copy (to another thread, etc.).
  // put() should be called on destruction of some previously copied pointer.
  // unlock() when done with the current pointer (_most common_).
  mutable Mutex _lock = {"PG::_lock"};

  std::atomic<unsigned int> ref{0};
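
  // Illustrative sketch (not part of this header), following the locking and
  // reference-counting comment above; tag and ordering are examples only:
  //
  //   pg->get("example-tag");   // take a ref before sharing the pointer
  //   pg->lock();
  //   ... inspect or mutate pg state ...
  //   pg->unlock();
  //   pg->put("example-tag");   // drop the ref when the copy is done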

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock = {"PG::_ref_id_lock"};
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id = 0;

  friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
  friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
#endif

private:
  friend void intrusive_ptr_add_ref(PG *pg) {
    pg->get("intptr");
  }
  friend void intrusive_ptr_release(PG *pg) {
    pg->put("intptr");
  }


  // =====================

protected:
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
  virtual const PGBackend* get_pgbackend() const = 0;

protected:
  /*** PG ****/
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  epoch_t last_persisted_osdmap;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    ceph_assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

protected:

  bool deleting;  // true while in removing or OSD is shutting down
  atomic<bool> deleted = {false};

  ZTracer::Endpoint trace_endpoint;


protected:
  bool dirty_info, dirty_big_info;

protected:
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v = 0;
  static const __u8 latest_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //     (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 10;
  void upgrade(ObjectStore *store);

protected:
  PGLog pg_log;
  ghobject_t pgmeta_oid;

  // ------------------
  // MissingLoc

  class MissingLoc {
  public:
    // a loc_count indicates how many locations we know in each of
    // these distinct sets
    struct loc_count_t {
      int up = 0;     //< up
      int other = 0;  //< other

      friend bool operator<(const loc_count_t& l,
                            const loc_count_t& r) {
        return (l.up < r.up ||
                (l.up == r.up &&
                 (l.other < r.other)));
      }
      friend ostream& operator<<(ostream& out, const loc_count_t& l) {
        ceph_assert(l.up >= 0);
        ceph_assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
      }
    };


  private:

    loc_count_t _get_count(const set<pg_shard_t>& shards) {
      loc_count_t r;
      for (auto s : shards) {
        if (pg->upset.count(s)) {
          r.up++;
        } else {
          r.other++;
        }
      }
      return r;
    }

    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;

    // for every entry in missing_loc, we count how many of each type of shard we have,
    // and maintain totals here. The sum of the values for this map will always equal
    // missing_loc.size().
    map<shard_id_t, map<loc_count_t,int> > missing_by_count;
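
    // Illustrative worked example (not part of this header): on a replicated
    // pool (single NO_SHARD bucket) with upset = {0,1} and missing_loc =
    // { A -> {0,2}, B -> {1,2}, C -> {2} }, the per-object counts are (1+1),
    // (1+1) and (0+1), so missing_by_count[NO_SHARD] is
    // { (1+1) -> 2, (0+1) -> 1 }, whose values sum to 3 == missing_loc.size().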

    void pgs_by_shard_id(const set<pg_shard_t>& s, map<shard_id_t, set<pg_shard_t> >& pgsbs) {
      if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
        int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
        // For completely missing shards initialize with empty set<pg_shard_t>
        for (int i = 0 ; i < num_shards ; ++i) {
          shard_id_t shard(i);
          pgsbs[shard];
        }
        for (auto pgs: s)
          pgsbs[pgs.shard].insert(pgs);
      } else {
        pgsbs[shard_id_t::NO_SHARD] = s;
      }
    }

    void _inc_count(const set<pg_shard_t>& s) {
      map<shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs)
        ++missing_by_count[shard.first][_get_count(shard.second)];
    }
    void _dec_count(const set<pg_shard_t>& s) {
      map<shard_id_t, set<pg_shard_t> > pgsbs;
      pgs_by_shard_id(s, pgsbs);
      for (auto shard: pgsbs) {
        auto p = missing_by_count[shard.first].find(_get_count(shard.second));
        ceph_assert(p != missing_by_count[shard.first].end());
        if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
        }
      }
    }

    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) { }
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    std::ostream& gen_prefix(std::ostream& out) const {
      return pg->gen_prefix(out);
    }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      return i->second.is_delete();
    }
    bool is_unfound(const hobject_t &hoid) const {
      auto it = needs_recovery_map.find(hoid);
      if (it == needs_recovery_map.end()) {
        return false;
      }
      if (it->second.is_delete()) {
        return false;
      }
      auto mit = missing_loc.find(hoid);
      return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (i->second.is_delete())
          continue;
        auto mi = missing_loc.find(i->first);
        if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (i->second.is_delete())
          continue;
        auto mi = missing_loc.find(i->first);
        if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
      missing_by_count.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p == missing_loc.end()) {
        p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
      } else {
        _dec_count(p->second);
      }
      p->second.insert(location);
      _inc_count(p->second);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
        if (p->second.empty()) {
          missing_loc.erase(p);
        } else {
          _inc_count(p->second);
        }
      }
    }

    void clear_location(const hobject_t &hoid) {
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
          ceph_assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
      needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      auto it = needs_recovery_map.find(hoid);
      ceph_assert(it != needs_recovery_map.end());
      it->second.need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      ); ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources,  ///< [in] a set of resources which can be used for all objects
      ThreadPool::TPHandle* handle     ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      auto p = missing_loc.find(hoid);
      if (p != missing_loc.end()) {
        _dec_count(p->second);
        missing_loc.erase(p);
      }
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          ceph_assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
        return;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      ceph_assert(info.last_backfill.is_max());
      ceph_assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        if (i.first == self)
          continue;
        auto pinfoiter = pinfo.find(i.first);
        ceph_assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
      _inc_count(mliter->second);
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      auto it = missing_loc.find(hoid);
      return it == missing_loc.end() ? empty_set : it->second;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
    const map<shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
      return missing_by_count;
    }
  } missing_loc;

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  multiset<hobject_t> recovering_oids;
#endif

protected:
  int role;        // 0 = primary, 1 = replica, -1=none.
  uint64_t state;  // PG_STATE_*

  bool send_notify;  ///< true if we are non-primary and should notify the primary

protected:
  eversion_t last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk;  // last_complete that has committed.
  eversion_t last_update_applied;

  // entries <= last_rollback_info_trimmed_to_applied have been trimmed
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
protected:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  // acting_recovery_backfill contains shards that are acting,
  // async recovery targets, or backfill targets.
  set<pg_shard_t> acting_recovery_backfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by;  ///< osds we are blocked by (for pg stats)

protected:
  // [primary only] content recovery state
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

public:
  bool dne() { return info.dne(); }
  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      ceph_assert(query_map);
      ceph_assert(info_map);
      ceph_assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }

    void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
      ceph_assert(notify_list);
      (*notify_list)[to.osd].push_back(make_pair(info, pi));
    }
  };
protected:

  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };



protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;              // non-acting osds that have PG data.
  map<pg_shard_t, pg_info_t> peer_info;   // info from peers (stray or prior)
  map<pg_shard_t, int64_t> peer_bytes;    // Peer's num_bytes from peer_info
  set<pg_shard_t> peer_purged;            // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;     // logs i've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // i deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound;  // These osds might have objects on them
                                       // which are unfound on the primary
  epoch_t last_peering_reset;

  epoch_t get_last_peering_reset() const {
    return last_peering_reset;
  }

  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();

  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

public:
  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version; /// version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      ceph_assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
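
  // Illustrative sketch (not part of this header): a scan populates the
  // interval and recovery then consumes it from the front; assuming `bi` is
  // filled in by a collection scan:
  //
  //   bi.reset(scan_start);       // begin == end == scan_start, no objects
  //   ... fill bi.objects and bi.end from the scan ...
  //   while (!bi.empty()) {
  //     hobject_t next = bi.begin;  // == bi.objects.begin()->first after trim()
  //     ... backfill `next` ...
  //     bi.pop_front();           // drop it and advance begin
  //   }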

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  set<pg_shard_t> backfill_targets, async_recovery_targets;

  // The primary's num_bytes and local num_bytes for this pg, only valid
  // during backfill for non-primary shards.
  // Both of these are adjusted for EC to reflect the on-disk bytes
  std::atomic<int64_t> primary_num_bytes = 0;
  std::atomic<int64_t> local_num_bytes = 0;

public:
  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

  // Space reserved for backfill is primary_num_bytes - local_num_bytes
  // Don't care that difference itself isn't atomic
  uint64_t get_reserved_num_bytes() {
    int64_t primary = primary_num_bytes.load();
    int64_t local = local_num_bytes.load();
    if (primary > local)
      return primary - local;
    else
      return 0;
  }

  bool is_remote_backfilling() {
    return primary_num_bytes.load() > 0;
  }

  void set_reserved_num_bytes(int64_t primary, int64_t local);
  void clear_reserved_num_bytes();

  // If num_bytes is inconsistent and local_num_bytes would go negative,
  // that's ok, because it would then be ignored.

  // The value of num_bytes could be negative,
  // but we don't let local_num_bytes go negative.
  void add_local_num_bytes(int64_t num_bytes) {
    if (num_bytes) {
      int64_t prev_bytes = local_num_bytes.load();
      int64_t new_bytes;
      do {
        new_bytes = prev_bytes + num_bytes;
        if (new_bytes < 0)
          new_bytes = 0;
      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
    }
  }
  void sub_local_num_bytes(int64_t num_bytes) {
    ceph_assert(num_bytes >= 0);
    if (num_bytes) {
      int64_t prev_bytes = local_num_bytes.load();
      int64_t new_bytes;
      do {
        new_bytes = prev_bytes - num_bytes;
        if (new_bytes < 0)
          new_bytes = 0;
      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
    }
  }
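
  // Illustrative note (not part of this header): the compare_exchange_weak
  // loops above are the usual lock-free read-modify-write pattern; when the
  // exchange fails, prev_bytes is reloaded with the current value and the
  // clamped-at-zero result is recomputed, so no update is lost and no PG lock
  // is needed for the atomic update itself.
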
  // The value of num_bytes could be negative,
  // but we don't let info.stats.stats.sum.num_bytes go negative.
  void add_num_bytes(int64_t num_bytes) {
    ceph_assert(_lock.is_locked_by_me());
    if (num_bytes) {
      info.stats.stats.sum.num_bytes += num_bytes;
      if (info.stats.stats.sum.num_bytes < 0) {
        info.stats.stats.sum.num_bytes = 0;
      }
    }
  }
  void sub_num_bytes(int64_t num_bytes) {
    ceph_assert(_lock.is_locked_by_me());
    ceph_assert(num_bytes >= 0);
    if (num_bytes) {
      info.stats.stats.sum.num_bytes -= num_bytes;
      if (info.stats.stats.sum.num_bytes < 0) {
        info.stats.stats.sum.num_bytes = 0;
      }
    }
  }

  // Only used in testing so not worried about needing the PG lock here
  int64_t get_stats_num_bytes() {
    Mutex::Locker l(_lock);
    int num_bytes = info.stats.stats.sum.num_bytes;
    if (pool.info.is_erasure()) {
      num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
      // Round up each object by a stripe
      num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
    }
    int64_t lnb = local_num_bytes.load();
    if (lnb && lnb != num_bytes) {
      lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
                            << lnb << " vs stats "
                            << info.stats.stats.sum.num_bytes << " / chunk "
                            << get_pgbackend()->get_ec_data_chunk_count()
                            << dendl;
    }
    return num_bytes;
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued.  Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order.  Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_flush
   *    - is_active() and flushes_in_progress
   *    - waiting for final flush during activate
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   *  1. During an interval change, we requeue *everything* in the above order.
   *
   *  2. When an obc rwlock is released, we check for a scrub block and requeue
   *     the op there if it applies.  We ignore the unreadable/degraded/blocked
   *     queues because we assume they cannot apply at that time (this is
   *     probably mostly true).
   *
   *  3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *     it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounter an unexpected error.  FIXME.
   */
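
  // Illustrative note (not part of this header): per note 1 above, an
  // interval change requeues every list in the order given (waiting_for_map
  // first, obc waiters last), with each list itself pushed back
  // most-recent-first, so ops pop back out of the op_wq in their original
  // arrival order.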

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than ours (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef>            waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef>            waiting_for_active;
  list<OpRequestRef>            waiting_for_flush;
  list<OpRequestRef>            waiting_for_scrub;

  list<OpRequestRef>            waiting_for_cache_not_full;
  list<OpRequestRef>            waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  void _update_calc_stats();
  void _update_blocked_by();
  friend class TestOpsSocketHook;
  void publish_stats_to_osd();
  void clear_publish_stats();

  void clear_primary_state();

  bool is_acting_recovery_backfill(pg_shard_t osd) const {
    return acting_recovery_backfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.is_erasure(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.is_erasure(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }
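
  // Illustrative note (not part of this header): shard ids are positional for
  // erasure-coded pools, so with up = [3, 1, 4], has_shard(true, up,
  // pg_shard_t(1, shard_id_t(1))) is true, while the same osd with
  // shard_id_t(0) would not match; for replicated pools only the osd id
  // matters, so any shard value of osd 1 matches.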

  bool needs_recovery() const;
  bool needs_backfill() const;

  /// clip calculated priority to reasonable range
  int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();
  /// get priority for pg deletion
  unsigned get_delete_priority();

  void try_mark_clean();  ///< mark an active pg clean

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = std::max(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
      info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = std::max(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
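
  // Illustrative worked example (not part of this header): with
  // last_epoch_clean = 120, oldest_map = 100, same_interval_since = 150, and
  // a pool created before epoch 120, the bounds are
  // [max(120, 100), max(150, epoch_pool_created)) = [120, 150): past
  // intervals are needed from the last clean epoch up to the start of the
  // current interval.
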
1464 void check_past_interval_bounds() const;
1465 PastIntervals::PriorSet build_prior();
1466
1467 void remove_down_peer_info(const OSDMapRef osdmap);
1468
1469 bool adjust_need_up_thru(const OSDMapRef osdmap);
1470
1471 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1472 virtual void dump_recovery_info(Formatter *f) const = 0;
1473
11fdf7f2 1474 void calc_min_last_complete_ondisk() {
7c673cae 1475 eversion_t min = last_complete_ondisk;
11fdf7f2
TL
1476 ceph_assert(!acting_recovery_backfill.empty());
1477 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1478 i != acting_recovery_backfill.end();
7c673cae
FG
1479 ++i) {
1480 if (*i == get_primary()) continue;
1481 if (peer_last_complete_ondisk.count(*i) == 0)
11fdf7f2 1482 return; // we don't have complete info
7c673cae
FG
1483 eversion_t a = peer_last_complete_ondisk[*i];
1484 if (a < min)
1485 min = a;
1486 }
1487 if (min == min_last_complete_ondisk)
11fdf7f2 1488 return;
7c673cae 1489 min_last_complete_ondisk = min;
11fdf7f2 1490 return;
7c673cae
FG
1491 }
1492
1493 virtual void calc_trim_to() = 0;
1494
f64942e4
AA
1495 virtual void calc_trim_to_aggressive() = 0;
1496
7c673cae
FG
1497 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1498 pg_missing_t& omissing, pg_shard_t from);
1499 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1500 pg_missing_t& omissing, pg_shard_t from);
1501 bool proc_replica_info(
1502 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1503
1504 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1505 PG *pg;
1506 ObjectStore::Transaction *t;
1507 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1508
1509 // LogEntryHandler
1510 void remove(const hobject_t &hoid) override {
1511 pg->get_pgbackend()->remove(hoid, t);
1512 }
1513 void try_stash(const hobject_t &hoid, version_t v) override {
1514 pg->get_pgbackend()->try_stash(hoid, v, t);
1515 }
1516 void rollback(const pg_log_entry_t &entry) override {
11fdf7f2 1517 ceph_assert(entry.can_rollback());
7c673cae
FG
1518 pg->get_pgbackend()->rollback(entry, t);
1519 }
1520 void rollforward(const pg_log_entry_t &entry) override {
1521 pg->get_pgbackend()->rollforward(entry, t);
1522 }
1523 void trim(const pg_log_entry_t &entry) override {
1524 pg->get_pgbackend()->trim(entry, t);
1525 }
1526 };
1527
1528 void update_object_snap_mapping(
1529 ObjectStore::Transaction *t, const hobject_t &soid,
1530 const set<snapid_t> &snaps);
1531 void clear_object_snap_mapping(
1532 ObjectStore::Transaction *t, const hobject_t &soid);
1533 void remove_snap_mapped_object(
1534 ObjectStore::Transaction& t, const hobject_t& soid);
1535 void merge_log(
1536 ObjectStore::Transaction& t, pg_info_t &oinfo,
1537 pg_log_t &olog, pg_shard_t from);
1538 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1539 bool search_for_missing(
1540 const pg_info_t &oinfo, const pg_missing_t &omissing,
1541 pg_shard_t fromosd,
1542 RecoveryCtx*);
1543
7c673cae
FG
1544 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1545
7c673cae
FG
1546 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1547 const map<pg_shard_t, pg_info_t> &infos,
1548 bool restrict_to_up_acting,
1549 bool *history_les_bound) const;
1550 static void calc_ec_acting(
1551 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1552 unsigned size,
1553 const vector<int> &acting,
7c673cae 1554 const vector<int> &up,
7c673cae 1555 const map<pg_shard_t, pg_info_t> &all_info,
7c673cae
FG
1556 bool restrict_to_up_acting,
1557 vector<int> *want,
1558 set<pg_shard_t> *backfill,
1559 set<pg_shard_t> *acting_backfill,
7c673cae
FG
1560 ostream &ss);
1561 static void calc_replicated_acting(
1562 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
11fdf7f2 1563 uint64_t force_auth_primary_missing_objects,
7c673cae
FG
1564 unsigned size,
1565 const vector<int> &acting,
7c673cae
FG
1566 const vector<int> &up,
1567 pg_shard_t up_primary,
1568 const map<pg_shard_t, pg_info_t> &all_info,
7c673cae
FG
1569 bool restrict_to_up_acting,
1570 vector<int> *want,
1571 set<pg_shard_t> *backfill,
1572 set<pg_shard_t> *acting_backfill,
11fdf7f2 1573 const OSDMapRef osdmap,
7c673cae 1574 ostream &ss);
11fdf7f2
TL
1575 void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1576 const pg_info_t &auth_info,
1577 vector<int> *want,
81eedcae
TL
1578 set<pg_shard_t> *async_recovery,
1579 const OSDMapRef osdmap) const;
11fdf7f2
TL
1580 void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1581 const pg_info_t &auth_info,
1582 vector<int> *want,
81eedcae
TL
1583 set<pg_shard_t> *async_recovery,
1584 const OSDMapRef osdmap) const;
11fdf7f2
TL
1585
1586 bool recoverable_and_ge_min_size(const vector<int> &want) const;
7c673cae
FG
1587 bool choose_acting(pg_shard_t &auth_log_shard,
1588 bool restrict_to_up_acting,
1589 bool *history_les_bound);
1590 void build_might_have_unfound();
1591 void activate(
1592 ObjectStore::Transaction& t,
1593 epoch_t activation_epoch,
7c673cae
FG
1594 map<int, map<spg_t,pg_query_t> >& query_map,
1595 map<int,
1596 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1597 RecoveryCtx *ctx);
11fdf7f2
TL
1598
1599 struct C_PG_ActivateCommitted : public Context {
1600 PGRef pg;
1601 epoch_t epoch;
1602 epoch_t activation_epoch;
1603 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1604 : pg(p), epoch(e), activation_epoch(ae) {}
1605 void finish(int r) override {
1606 pg->_activate_committed(epoch, activation_epoch);
1607 }
1608 };
7c673cae
FG
1609 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1610 void all_activated_and_committed();
1611
1612 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1613
1614 bool have_unfound() const {
1615 return missing_loc.have_unfound();
1616 }
1617 uint64_t get_num_unfound() const {
1618 return missing_loc.num_unfound();
1619 }
81eedcae
TL
1620 bool all_missing_unfound() const {
1621 const auto& missing = pg_log.get_missing();
1622 if (!missing.have_missing())
1623 return false;
1624 for (auto& m : missing.get_items()) {
1625 if (!missing_loc.is_unfound(m.first))
1626 return false;
1627 }
1628 return true;
1629 }
7c673cae
FG
1630
1631 virtual void check_local() = 0;
1632
7c673cae
FG
1633 void purge_strays();
1634
1635 void update_heartbeat_peers();
1636
1637 Context *finish_sync_event;
1638
11fdf7f2 1639 Context *finish_recovery();
7c673cae 1640 void _finish_recovery(Context *c);
11fdf7f2
TL
1641 struct C_PG_FinishRecovery : public Context {
1642 PGRef pg;
1643 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1644 void finish(int r) override {
1645 pg->_finish_recovery(this);
1646 }
1647 };
7c673cae
FG
1648 void cancel_recovery();
1649 void clear_recovery_state();
1650 virtual void _clear_recovery_state() = 0;
1651 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1652 void start_recovery_op(const hobject_t& soid);
1653 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1654
7c673cae
FG
1655 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1656
1657 friend class C_OSD_RepModify_Commit;
11fdf7f2 1658 friend class C_DeleteMore;
7c673cae
FG
1659
1660 // -- backoff --
1661 Mutex backoff_lock; // orders inside Backoff::lock
1662 map<hobject_t,set<BackoffRef>> backoffs;
1663
1664 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1665 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1666 void release_backoffs(const hobject_t& o) {
1667 release_backoffs(o, o);
1668 }
1669 void clear_backoffs();
1670
1671 void add_pg_backoff(SessionRef s) {
1672 hobject_t begin = info.pgid.pgid.get_hobj_start();
1673 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1674 add_backoff(s, begin, end);
1675 }
1676 void release_pg_backoffs() {
1677 hobject_t begin = info.pgid.pgid.get_hobj_start();
1678 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1679 release_backoffs(begin, end);
1680 }
1681
7c673cae 1682 // -- scrub --
11fdf7f2 1683public:
7c673cae
FG
1684 struct Scrubber {
1685 Scrubber();
1686 ~Scrubber();
1687
1688 // metadata
1689 set<pg_shard_t> reserved_peers;
1690 bool reserved, reserve_failed;
1691 epoch_t epoch_start;
1692
1693 // common to both scrubs
1694 bool active;
7c673cae
FG
1695 set<pg_shard_t> waiting_on_whom;
1696 int shallow_errors;
1697 int deep_errors;
1698 int fixed;
1699 ScrubMap primary_scrubmap;
28e407b8
AA
1700 ScrubMapBuilder primary_scrubmap_pos;
1701 epoch_t replica_scrub_start = 0;
1702 ScrubMap replica_scrubmap;
1703 ScrubMapBuilder replica_scrubmap_pos;
7c673cae
FG
1704 map<pg_shard_t, ScrubMap> received_maps;
1705 OpRequestRef active_rep_scrub;
1706 utime_t scrub_reg_stamp; // stamp we registered for
1707
494da23a
TL
1708 static utime_t scrub_must_stamp() { return utime_t(0,1); }
1709
11fdf7f2
TL
1710 omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };
1711
7c673cae
FG
1712 // For async sleep
1713 bool sleeping = false;
1714 bool needs_sleep = true;
1715 utime_t sleep_start;
1716
1717 // flags to indicate explicitly requested scrubs (by admin)
494da23a 1718 bool must_scrub, must_deep_scrub, must_repair, need_auto;
7c673cae
FG
1719
1720 // Priority to use for scrub scheduling
11fdf7f2 1721 unsigned priority = 0;
7c673cae 1722
494da23a 1723 bool time_for_deep;
7c673cae
FG
1724 // this flag indicates whether we should auto-repair the PG
1725 bool auto_repair;
1726 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1727 bool check_repair;
1728 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1729 // we should deep scrub in order to auto repair
1730 bool deep_scrub_on_error;
1731
1732 // Maps from objects with errors to missing/inconsistent peers
1733 map<hobject_t, set<pg_shard_t>> missing;
1734 map<hobject_t, set<pg_shard_t>> inconsistent;
1735
1736 // Map from object with errors to good peers
1737 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1738
1739 // Cleaned map pending snap metadata scrub
1740 ScrubMap cleaned_meta_map;
1741
1742 void clean_meta_map(ScrubMap &for_meta_scrub) {
1743 if (end.is_max() ||
1744 cleaned_meta_map.objects.empty()) {
1745 cleaned_meta_map.swap(for_meta_scrub);
1746 } else {
1747 auto iter = cleaned_meta_map.objects.end();
1748 --iter; // non-empty; see the if-clause above
1749 auto begin = cleaned_meta_map.objects.begin();
1750 if (iter->first.has_snapset()) {
1751 ++iter;
1752 } else {
1753 while (iter != begin) {
1754 auto next = iter--;
1755 if (next->first.get_head() != iter->first.get_head()) {
1756 ++iter;
1757 break;
1758 }
1759 }
1760 }
1761 for_meta_scrub.objects.insert(begin, iter);
1762 cleaned_meta_map.objects.erase(begin, iter);
1763 }
1764 }
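// Worked example (illustrative; relies on hobject ordering placing clones
// before their head): if cleaned_meta_map holds {foo:4, foo:head, bar:2,
// bar:3} mid-scrub, the backward walk stops at the last head boundary, so
// {foo:4, foo:head} are handed to for_meta_scrub while {bar:2, bar:3} are
// retained until bar's head arrives in a later chunk.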
1765
1766 // digest updates which we are waiting on
1767 int num_digest_updates_pending;
1768
1769 // chunky scrub
1770 hobject_t start, end; // [start,end)
1771 hobject_t max_end; // Largest end that may have been sent to replicas
1772 eversion_t subset_last_update;
1773
1774 // chunky scrub state
1775 enum State {
1776 INACTIVE,
1777 NEW_CHUNK,
1778 WAIT_PUSHES,
1779 WAIT_LAST_UPDATE,
1780 BUILD_MAP,
1781 BUILD_MAP_DONE,
1782 WAIT_REPLICAS,
1783 COMPARE_MAPS,
1784 WAIT_DIGEST_UPDATES,
1785 FINISH,
1786 BUILD_MAP_REPLICA,
1787 } state;
1788
1789 std::unique_ptr<Scrub::Store> store;
1790 // deep scrub
1791 bool deep;
1792 int preempt_left;
1793 int preempt_divisor;
1794
1795 list<Context*> callbacks;
1796 void add_callback(Context *context) {
1797 callbacks.push_back(context);
1798 }
1799 void run_callbacks() {
1800 list<Context*> to_run;
1801 to_run.swap(callbacks);
1802 for (list<Context*>::iterator i = to_run.begin();
1803 i != to_run.end();
1804 ++i) {
1805 (*i)->complete(0);
1806 }
1807 }
1808
1809 static const char *state_string(const PG::Scrubber::State& state) {
1810 const char *ret = NULL;
1811 switch( state )
1812 {
1813 case INACTIVE: ret = "INACTIVE"; break;
1814 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1815 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1816 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1817 case BUILD_MAP: ret = "BUILD_MAP"; break;
1818 case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1819 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1820 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1821 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1822 case FINISH: ret = "FINISH"; break;
1823 case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1824 }
1825 return ret;
1826 }
1827
1828 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1829
1830 // clear all state
1831 void reset() {
1832 active = false;
1833 waiting_on_whom.clear();
1834 if (active_rep_scrub) {
1835 active_rep_scrub = OpRequestRef();
1836 }
1837 received_maps.clear();
1838
1839 must_scrub = false;
1840 must_deep_scrub = false;
1841 must_repair = false;
1842 need_auto = false;
1843 time_for_deep = false;
1844 auto_repair = false;
1845 check_repair = false;
1846 deep_scrub_on_error = false;
1847
1848 state = PG::Scrubber::INACTIVE;
1849 start = hobject_t();
1850 end = hobject_t();
1851 max_end = hobject_t();
1852 subset_last_update = eversion_t();
1853 shallow_errors = 0;
1854 deep_errors = 0;
1855 fixed = 0;
1856 omap_stats = (const struct omap_stat_t){ 0 };
1857 deep = false;
1858 run_callbacks();
1859 inconsistent.clear();
1860 missing.clear();
1861 authoritative.clear();
1862 num_digest_updates_pending = 0;
1863 primary_scrubmap = ScrubMap();
1864 primary_scrubmap_pos.reset();
1865 replica_scrubmap = ScrubMap();
1866 replica_scrubmap_pos.reset();
1867 cleaned_meta_map = ScrubMap();
1868 sleeping = false;
1869 needs_sleep = true;
1870 sleep_start = utime_t();
1871 }
1872
1873 void create_results(const hobject_t& obj);
1874 void cleanup_store(ObjectStore::Transaction *t);
1875 } scrubber;
1876
1877protected:
1878 bool scrub_after_recovery;
1879
1880 int active_pushes;
1881
1882 bool scrub_can_preempt = false;
1883 bool scrub_preempted = false;
1884
1885 // We allow some number of preemptions of the scrub, during which we do
1886 // not block writes. After that we start to block, and once we start
1887 // blocking we do not stop until the scrub range is completed.
1888 bool write_blocked_by_scrub(const hobject_t &soid);
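// Sketch of that policy (illustrative, not the actual function body): while
// the preemption budget lasts, a conflicting write preempts the scrub rather
// than waiting:
//
//   if (soid >= scrubber.start && soid < scrubber.max_end) {
//     if (scrub_can_preempt) { scrub_preempted = true; return false; }
//     return true;   // writer waits until the chunk completes
//   }
//   return false;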
1889
1890 /// true if the given range intersects the scrub interval in any way
1891 bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1892
1893 void repair_object(
1894 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1895 pg_shard_t bad_peer);
1896
1897 void chunky_scrub(ThreadPool::TPHandle &handle);
1898 void scrub_compare_maps();
1899 /**
1900 * return true if any inconsistency/missing is repaired, false otherwise
1901 */
1902 bool scrub_process_inconsistent();
1903 bool ops_blocked_by_scrub() const;
1904 void scrub_finish();
1905 void scrub_clear_state(bool keep_repair = false);
1906 void _scan_snaps(ScrubMap &map);
1907 void _repair_oinfo_oid(ScrubMap &map);
1908 void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
1909 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1910 hobject_t start, hobject_t end, bool deep,
1911 bool allow_preemption);
1912 int build_scrub_map_chunk(
1913 ScrubMap &map,
1914 ScrubMapBuilder &pos,
1915 hobject_t start, hobject_t end, bool deep,
1916 ThreadPool::TPHandle &handle);
1917 /**
1918 * returns true if [begin, end) is good to scrub at this time
1919 * a false return value obliges the implementer to requeue scrub when the
1920 * condition preventing scrub clears
1921 */
1922 virtual bool _range_available_for_scrub(
1923 const hobject_t &begin, const hobject_t &end) = 0;
1924 virtual void scrub_snapshot_metadata(
1925 ScrubMap &map,
1926 const std::map<hobject_t,
1927 pair<boost::optional<uint32_t>,
1928 boost::optional<uint32_t>>> &missing_digest) { }
1929 virtual void _scrub_clear_state() { }
1930 virtual void _scrub_finish() { }
1931 void clear_scrub_reserved();
1932 void scrub_reserve_replicas();
1933 void scrub_unreserve_replicas();
1934 bool scrub_all_replicas_reserved() const;
1935
1936 void replica_scrub(
1937 OpRequestRef op,
1938 ThreadPool::TPHandle &handle);
1939 void do_replica_scrub_map(OpRequestRef op);
1940
1941 void handle_scrub_reserve_request(OpRequestRef op);
1942 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1943 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1944 void handle_scrub_reserve_release(OpRequestRef op);
1945
1946 void reject_reservation();
1947 void schedule_backfill_retry(float retry);
1948 void schedule_recovery_retry(float retry);
1949
1950 // -- recovery state --
1951
1952 template <class EVT>
1953 struct QueuePeeringEvt : Context {
1954 PGRef pg;
1955 epoch_t epoch;
1956 EVT evt;
1957 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1958 pg(pg), epoch(epoch), evt(evt) {}
1959 void finish(int r) override {
1960 pg->lock();
1961 pg->queue_peering_event(PGPeeringEventRef(
1962 new PGPeeringEvent(
1963 epoch,
1964 epoch,
1965 evt)));
1966 pg->unlock();
1967 }
1968 };
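// Typical use (hypothetical call site): register the wrapper on a
// transaction so the event is re-queued under the PG lock when the work
// commits, stamped with the current epoch so stale deliveries can be
// discarded by old_peering_evt():
//
//   t.register_on_commit(
//     new QueuePeeringEvt<DeleteSome>(this, get_osdmap_epoch(), DeleteSome()));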
1969
1970
1971 struct QueryState : boost::statechart::event< QueryState > {
1972 Formatter *f;
1973 explicit QueryState(Formatter *f) : f(f) {}
1974 void print(std::ostream *out) const {
1975 *out << "Query";
1976 }
1977 };
1978
1979public:
1980 int pg_stat_adjust(osd_stat_t *new_stat);
1981protected:
1982
1983 struct AdvMap : boost::statechart::event< AdvMap > {
1984 OSDMapRef osdmap;
1985 OSDMapRef lastmap;
1986 vector<int> newup, newacting;
1987 int up_primary, acting_primary;
1988 AdvMap(
1989 OSDMapRef osdmap, OSDMapRef lastmap,
1990 vector<int>& newup, int up_primary,
1991 vector<int>& newacting, int acting_primary):
1992 osdmap(osdmap), lastmap(lastmap),
1993 newup(newup),
1994 newacting(newacting),
1995 up_primary(up_primary),
1996 acting_primary(acting_primary) {}
1997 void print(std::ostream *out) const {
1998 *out << "AdvMap";
1999 }
2000 };
2001
2002 struct ActMap : boost::statechart::event< ActMap > {
2003 ActMap() : boost::statechart::event< ActMap >() {}
2004 void print(std::ostream *out) const {
2005 *out << "ActMap";
2006 }
2007 };
2008 struct Activate : boost::statechart::event< Activate > {
2009 epoch_t activation_epoch;
2010 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
2011 activation_epoch(q) {}
2012 void print(std::ostream *out) const {
2013 *out << "Activate from " << activation_epoch;
2014 }
2015 };
2016public:
2017 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
2018 explicit UnfoundBackfill() {}
2019 void print(std::ostream *out) const {
2020 *out << "UnfoundBackfill";
2021 }
2022 };
2023 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
2024 explicit UnfoundRecovery() {}
2025 void print(std::ostream *out) const {
2026 *out << "UnfoundRecovery";
2027 }
2028 };
2029
2030 struct RequestScrub : boost::statechart::event<RequestScrub> {
2031 bool deep;
2032 bool repair;
2033 explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
2034 void print(std::ostream *out) const {
2035 *out << "RequestScrub(" << (deep ? "deep" : "shallow")
2036 << (repair ? " repair" : "") << ")";
2037 }
2038 };
2039
2040protected:
2041 TrivialEvent(Initialize)
2042 TrivialEvent(GotInfo)
2043 TrivialEvent(NeedUpThru)
2044 TrivialEvent(Backfilled)
2045 TrivialEvent(LocalBackfillReserved)
2046 TrivialEvent(RejectRemoteReservation)
2047 public:
2048 TrivialEvent(RequestBackfill)
2049 protected:
2050 TrivialEvent(RemoteRecoveryPreempted)
2051 TrivialEvent(RemoteBackfillPreempted)
2052 TrivialEvent(BackfillTooFull)
2053 TrivialEvent(RecoveryTooFull)
2054
2055 TrivialEvent(MakePrimary)
2056 TrivialEvent(MakeStray)
2057 TrivialEvent(NeedActingChange)
2058 TrivialEvent(IsIncomplete)
2059 TrivialEvent(IsDown)
2060
2061 TrivialEvent(AllReplicasRecovered)
2062 TrivialEvent(DoRecovery)
2063 TrivialEvent(LocalRecoveryReserved)
2064 public:
2065 protected:
2066 TrivialEvent(AllRemotesReserved)
2067 TrivialEvent(AllBackfillsReserved)
2068 TrivialEvent(GoClean)
2069
2070 TrivialEvent(AllReplicasActivated)
2071
2072 TrivialEvent(IntervalFlush)
2073
2074 public:
2075 TrivialEvent(DeleteStart)
2076 TrivialEvent(DeleteSome)
2077
2078 TrivialEvent(SetForceRecovery)
2079 TrivialEvent(UnsetForceRecovery)
2080 TrivialEvent(SetForceBackfill)
2081 TrivialEvent(UnsetForceBackfill)
2082
2083 protected:
2084 TrivialEvent(DeleteReserved)
2085 TrivialEvent(DeleteInterrupted)
2086
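// TrivialEvent(T), defined earlier in this header, presumably expands to a
// payload-free statechart event shaped like the hand-written events above,
// roughly:
//
//   #define TrivialEvent(T) struct T : boost::statechart::event< T > {  \
//     T() : boost::statechart::event< T >() {}                          \
//     void print(std::ostream *out) const { *out << #T; }               \
//   };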
2087 /* Encapsulates PG recovery process */
2088 class RecoveryState {
2089 void start_handle(RecoveryCtx *new_ctx);
2090 void end_handle();
2091 public:
2092 void begin_block_outgoing();
2093 void end_block_outgoing();
2094 void clear_blocked_outgoing();
2095 private:
2096
2097 /* States */
2098 struct Initial;
2099 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
2100 RecoveryState *state;
2101 public:
2102 PG *pg;
2103
2104 utime_t event_time;
2105 uint64_t event_count;
2106
2107 void clear_event_counters() {
2108 event_time = utime_t();
2109 event_count = 0;
2110 }
2111
2112 void log_enter(const char *state_name);
2113 void log_exit(const char *state_name, utime_t duration);
2114
2115 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
2116
2117 /* Accessor functions for state methods */
2118 ObjectStore::Transaction* get_cur_transaction() {
2119 ceph_assert(state->rctx);
2120 ceph_assert(state->rctx->transaction);
2121 return state->rctx->transaction;
2122 }
2123
2124 void send_query(pg_shard_t to, const pg_query_t &query) {
2125 ceph_assert(state->rctx);
2126 ceph_assert(state->rctx->query_map);
2127 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
2128 query;
2129 }
2130
2131 map<int, map<spg_t, pg_query_t> > *get_query_map() {
2132 ceph_assert(state->rctx);
2133 ceph_assert(state->rctx->query_map);
2134 return state->rctx->query_map;
2135 }
2136
2137 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
2138 ceph_assert(state->rctx);
2139 ceph_assert(state->rctx->info_map);
2140 return state->rctx->info_map;
2141 }
2142
2143 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
2144
2145 void send_notify(pg_shard_t to,
2146 const pg_notify_t &info, const PastIntervals &pi) {
2147 ceph_assert(state->rctx);
2148 state->rctx->send_notify(to, info, pi);
2149 }
2150 };
2151 friend class RecoveryMachine;
2152
2153 /* States */
2154 // Initial
2155 // Reset
2156 // Start
2157 // Started
2158 // Primary
2159 // WaitActingChange
2160 // Peering
2161 // GetInfo
2162 // GetLog
2163 // GetMissing
2164 // WaitUpThru
2165 // Incomplete
2166 // Active
2167 // Activating
2168 // Clean
2169 // Recovered
2170 // Backfilling
2171 // WaitRemoteBackfillReserved
2172 // WaitLocalBackfillReserved
2173 // NotBackfilling
2174 // NotRecovering
2175 // Recovering
2176 // WaitRemoteRecoveryReserved
2177 // WaitLocalRecoveryReserved
2178 // ReplicaActive
2179 // RepNotRecovering
2180 // RepRecovering
2181 // RepWaitBackfillReserved
2182 // RepWaitRecoveryReserved
2183 // Stray
11fdf7f2
TL
2184 // ToDelete
2185 // WaitDeleteReserved
2186 // Deleting
2187 // Crashed
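// The indentation above mirrors boost::statechart nesting: an event not
// handled by the innermost state is offered to each enclosing state in turn.
// Following the transitions below, a fresh primary typically walks
// Initial -> Reset -> Start -> Primary/Peering/GetInfo -> ... -> Active,
// while a non-primary takes Start -> Stray (and Stray -> ToDelete when the
// PG is being removed).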
2188
2189 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
2190 explicit Crashed(my_context ctx);
2191 };
2192
2193 struct Reset;
2194
2195 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
2196 explicit Initial(my_context ctx);
2197 void exit();
2198
2199 typedef boost::mpl::list <
2200 boost::statechart::transition< Initialize, Reset >,
2201 boost::statechart::custom_reaction< NullEvt >,
2202 boost::statechart::transition< boost::statechart::event_base, Crashed >
2203 > reactions;
2204
2205 boost::statechart::result react(const MNotifyRec&);
2206 boost::statechart::result react(const MInfoRec&);
2207 boost::statechart::result react(const MLogRec&);
2208 boost::statechart::result react(const boost::statechart::event_base&) {
2209 return discard_event();
2210 }
2211 };
2212
2213 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
2214 explicit Reset(my_context ctx);
2215 void exit();
2216
2217 typedef boost::mpl::list <
2218 boost::statechart::custom_reaction< QueryState >,
2219 boost::statechart::custom_reaction< AdvMap >,
2220 boost::statechart::custom_reaction< ActMap >,
2221 boost::statechart::custom_reaction< NullEvt >,
2222 boost::statechart::custom_reaction< IntervalFlush >,
2223 boost::statechart::transition< boost::statechart::event_base, Crashed >
2224 > reactions;
2225 boost::statechart::result react(const QueryState& q);
2226 boost::statechart::result react(const AdvMap&);
2227 boost::statechart::result react(const ActMap&);
2228 boost::statechart::result react(const IntervalFlush&);
2229 boost::statechart::result react(const boost::statechart::event_base&) {
2230 return discard_event();
2231 }
2232 };
2233
2234 struct Start;
2235
2236 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
2237 explicit Started(my_context ctx);
2238 void exit();
2239
2240 typedef boost::mpl::list <
2241 boost::statechart::custom_reaction< QueryState >,
2242 boost::statechart::custom_reaction< AdvMap >,
2243 boost::statechart::custom_reaction< IntervalFlush >,
2244 // ignored
2245 boost::statechart::custom_reaction< NullEvt >,
2246 boost::statechart::custom_reaction<SetForceRecovery>,
2247 boost::statechart::custom_reaction<UnsetForceRecovery>,
2248 boost::statechart::custom_reaction<SetForceBackfill>,
2249 boost::statechart::custom_reaction<UnsetForceBackfill>,
2250 boost::statechart::custom_reaction<RequestScrub>,
2251 // crash
2252 boost::statechart::transition< boost::statechart::event_base, Crashed >
2253 > reactions;
2254 boost::statechart::result react(const QueryState& q);
2255 boost::statechart::result react(const AdvMap&);
2256 boost::statechart::result react(const IntervalFlush&);
2257 boost::statechart::result react(const boost::statechart::event_base&) {
2258 return discard_event();
2259 }
2260 };
2261
2262 struct Primary;
2263 struct Stray;
2264
2265 struct Start : boost::statechart::state< Start, Started >, NamedState {
2266 explicit Start(my_context ctx);
2267 void exit();
2268
2269 typedef boost::mpl::list <
2270 boost::statechart::transition< MakePrimary, Primary >,
2271 boost::statechart::transition< MakeStray, Stray >
2272 > reactions;
2273 };
2274
2275 struct Peering;
2276 struct WaitActingChange;
2277 struct Incomplete;
2278 struct Down;
2279
2280 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
2281 explicit Primary(my_context ctx);
2282 void exit();
2283
2284 typedef boost::mpl::list <
2285 boost::statechart::custom_reaction< ActMap >,
2286 boost::statechart::custom_reaction< MNotifyRec >,
2287 boost::statechart::transition< NeedActingChange, WaitActingChange >,
2288 boost::statechart::custom_reaction<SetForceRecovery>,
2289 boost::statechart::custom_reaction<UnsetForceRecovery>,
2290 boost::statechart::custom_reaction<SetForceBackfill>,
2291 boost::statechart::custom_reaction<UnsetForceBackfill>,
2292 boost::statechart::custom_reaction<RequestScrub>
2293 > reactions;
2294 boost::statechart::result react(const ActMap&);
2295 boost::statechart::result react(const MNotifyRec&);
2296 boost::statechart::result react(const SetForceRecovery&);
2297 boost::statechart::result react(const UnsetForceRecovery&);
2298 boost::statechart::result react(const SetForceBackfill&);
2299 boost::statechart::result react(const UnsetForceBackfill&);
2300 boost::statechart::result react(const RequestScrub&);
2301 };
2302
2303 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
2304 NamedState {
2305 typedef boost::mpl::list <
2306 boost::statechart::custom_reaction< QueryState >,
2307 boost::statechart::custom_reaction< AdvMap >,
2308 boost::statechart::custom_reaction< MLogRec >,
2309 boost::statechart::custom_reaction< MInfoRec >,
2310 boost::statechart::custom_reaction< MNotifyRec >
2311 > reactions;
2312 explicit WaitActingChange(my_context ctx);
2313 boost::statechart::result react(const QueryState& q);
2314 boost::statechart::result react(const AdvMap&);
2315 boost::statechart::result react(const MLogRec&);
2316 boost::statechart::result react(const MInfoRec&);
2317 boost::statechart::result react(const MNotifyRec&);
2318 void exit();
2319 };
2320
2321 struct GetInfo;
2322 struct Active;
2323
2324 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
2325 PastIntervals::PriorSet prior_set;
2326 bool history_les_bound; ///< need osd_find_best_info_ignore_history_les
2327
2328 explicit Peering(my_context ctx);
2329 void exit();
2330
2331 typedef boost::mpl::list <
2332 boost::statechart::custom_reaction< QueryState >,
2333 boost::statechart::transition< Activate, Active >,
2334 boost::statechart::custom_reaction< AdvMap >
2335 > reactions;
2336 boost::statechart::result react(const QueryState& q);
2337 boost::statechart::result react(const AdvMap &advmap);
2338 };
2339
2340 struct WaitLocalRecoveryReserved;
2341 struct Activating;
2342 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
2343 explicit Active(my_context ctx);
2344 void exit();
2345
2346 const set<pg_shard_t> remote_shards_to_reserve_recovery;
2347 const set<pg_shard_t> remote_shards_to_reserve_backfill;
2348 bool all_replicas_activated;
2349
2350 typedef boost::mpl::list <
2351 boost::statechart::custom_reaction< QueryState >,
2352 boost::statechart::custom_reaction< ActMap >,
2353 boost::statechart::custom_reaction< AdvMap >,
2354 boost::statechart::custom_reaction< MInfoRec >,
2355 boost::statechart::custom_reaction< MNotifyRec >,
2356 boost::statechart::custom_reaction< MLogRec >,
2357 boost::statechart::custom_reaction< MTrim >,
2358 boost::statechart::custom_reaction< Backfilled >,
2359 boost::statechart::custom_reaction< AllReplicasActivated >,
2360 boost::statechart::custom_reaction< DeferRecovery >,
2361 boost::statechart::custom_reaction< DeferBackfill >,
2362 boost::statechart::custom_reaction< UnfoundRecovery >,
2363 boost::statechart::custom_reaction< UnfoundBackfill >,
2364 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2365 boost::statechart::custom_reaction< RemoteReservationRevoked>,
2366 boost::statechart::custom_reaction< DoRecovery>
2367 > reactions;
2368 boost::statechart::result react(const QueryState& q);
2369 boost::statechart::result react(const ActMap&);
2370 boost::statechart::result react(const AdvMap&);
2371 boost::statechart::result react(const MInfoRec& infoevt);
2372 boost::statechart::result react(const MNotifyRec& notevt);
2373 boost::statechart::result react(const MLogRec& logevt);
2374 boost::statechart::result react(const MTrim& trimevt);
2375 boost::statechart::result react(const Backfilled&) {
2376 return discard_event();
2377 }
2378 boost::statechart::result react(const AllReplicasActivated&);
2379 boost::statechart::result react(const DeferRecovery& evt) {
2380 return discard_event();
2381 }
2382 boost::statechart::result react(const DeferBackfill& evt) {
2383 return discard_event();
2384 }
2385 boost::statechart::result react(const UnfoundRecovery& evt) {
2386 return discard_event();
2387 }
2388 boost::statechart::result react(const UnfoundBackfill& evt) {
2389 return discard_event();
2390 }
2391 boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
2392 return discard_event();
2393 }
2394 boost::statechart::result react(const RemoteReservationRevoked&) {
2395 return discard_event();
2396 }
2397 boost::statechart::result react(const DoRecovery&) {
2398 return discard_event();
2399 }
2400 };
2401
2402 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
2403 typedef boost::mpl::list<
2404 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2405 boost::statechart::custom_reaction<SetForceRecovery>,
2406 boost::statechart::custom_reaction<SetForceBackfill>
2407 > reactions;
2408 explicit Clean(my_context ctx);
2409 void exit();
2410 boost::statechart::result react(const boost::statechart::event_base&) {
2411 return discard_event();
2412 }
2413 };
2414
2415 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
2416 typedef boost::mpl::list<
2417 boost::statechart::transition< GoClean, Clean >,
2418 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2419 boost::statechart::custom_reaction< AllReplicasActivated >
2420 > reactions;
2421 explicit Recovered(my_context ctx);
2422 void exit();
2423 boost::statechart::result react(const AllReplicasActivated&) {
2424 post_event(GoClean());
2425 return forward_event();
2426 }
2427 };
2428
2429 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2430 typedef boost::mpl::list<
2431 boost::statechart::custom_reaction< Backfilled >,
2432 boost::statechart::custom_reaction< DeferBackfill >,
2433 boost::statechart::custom_reaction< UnfoundBackfill >,
2434 boost::statechart::custom_reaction< RemoteReservationRejected >,
2435 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2436 boost::statechart::custom_reaction< RemoteReservationRevoked>
2437 > reactions;
2438 explicit Backfilling(my_context ctx);
2439 boost::statechart::result react(const RemoteReservationRejected& evt) {
2440 // for compat with old peers
2441 post_event(RemoteReservationRevokedTooFull());
2442 return discard_event();
2443 }
2444 void backfill_release_reservations();
2445 boost::statechart::result react(const Backfilled& evt);
2446 boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
2447 boost::statechart::result react(const RemoteReservationRevoked& evt);
2448 boost::statechart::result react(const DeferBackfill& evt);
2449 boost::statechart::result react(const UnfoundBackfill& evt);
2450 void cancel_backfill();
2451 void exit();
2452 };
2453
2454 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2455 typedef boost::mpl::list<
2456 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2457 boost::statechart::custom_reaction< RemoteReservationRejected >,
2458 boost::statechart::custom_reaction< RemoteReservationRevoked >,
2459 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2460 > reactions;
2461 set<pg_shard_t>::const_iterator backfill_osd_it;
2462 explicit WaitRemoteBackfillReserved(my_context ctx);
2463 void retry();
2464 void exit();
2465 boost::statechart::result react(const RemoteBackfillReserved& evt);
2466 boost::statechart::result react(const RemoteReservationRejected& evt);
2467 boost::statechart::result react(const RemoteReservationRevoked& evt);
2468 };
2469
2470 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2471 typedef boost::mpl::list<
2472 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2473 > reactions;
2474 explicit WaitLocalBackfillReserved(my_context ctx);
2475 void exit();
2476 };
2477
2478 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2479 typedef boost::mpl::list<
2480 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2481 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2482 boost::statechart::custom_reaction< RemoteReservationRejected >
2483 > reactions;
2484 explicit NotBackfilling(my_context ctx);
2485 void exit();
2486 boost::statechart::result react(const RemoteBackfillReserved& evt);
2487 boost::statechart::result react(const RemoteReservationRejected& evt);
2488 };
2489
2490 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2491 typedef boost::mpl::list<
2492 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2493 boost::statechart::custom_reaction< DeferRecovery >,
2494 boost::statechart::custom_reaction< UnfoundRecovery >
2495 > reactions;
2496 explicit NotRecovering(my_context ctx);
2497 boost::statechart::result react(const DeferRecovery& evt) {
2498 /* no-op */
2499 return discard_event();
2500 }
2501 boost::statechart::result react(const UnfoundRecovery& evt) {
2502 /* no-op */
2503 return discard_event();
2504 }
2505 void exit();
2506 };
2507
2508 struct ToDelete;
2509 struct RepNotRecovering;
2510 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2511 explicit ReplicaActive(my_context ctx);
2512 void exit();
2513
2514 typedef boost::mpl::list <
2515 boost::statechart::custom_reaction< QueryState >,
2516 boost::statechart::custom_reaction< ActMap >,
2517 boost::statechart::custom_reaction< MQuery >,
2518 boost::statechart::custom_reaction< MInfoRec >,
2519 boost::statechart::custom_reaction< MLogRec >,
2520 boost::statechart::custom_reaction< MTrim >,
2521 boost::statechart::custom_reaction< Activate >,
2522 boost::statechart::custom_reaction< DeferRecovery >,
2523 boost::statechart::custom_reaction< DeferBackfill >,
2524 boost::statechart::custom_reaction< UnfoundRecovery >,
2525 boost::statechart::custom_reaction< UnfoundBackfill >,
2526 boost::statechart::custom_reaction< RemoteBackfillPreempted >,
2527 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2528 boost::statechart::custom_reaction< RecoveryDone >,
2529 boost::statechart::transition<DeleteStart, ToDelete>
2530 > reactions;
2531 boost::statechart::result react(const QueryState& q);
2532 boost::statechart::result react(const MInfoRec& infoevt);
2533 boost::statechart::result react(const MLogRec& logevt);
2534 boost::statechart::result react(const MTrim& trimevt);
2535 boost::statechart::result react(const ActMap&);
2536 boost::statechart::result react(const MQuery&);
2537 boost::statechart::result react(const Activate&);
2538 boost::statechart::result react(const RecoveryDone&) {
2539 return discard_event();
2540 }
2541 boost::statechart::result react(const DeferRecovery& evt) {
2542 return discard_event();
2543 }
2544 boost::statechart::result react(const DeferBackfill& evt) {
2545 return discard_event();
2546 }
2547 boost::statechart::result react(const UnfoundRecovery& evt) {
2548 return discard_event();
2549 }
2550 boost::statechart::result react(const UnfoundBackfill& evt) {
2551 return discard_event();
2552 }
2553 boost::statechart::result react(const RemoteBackfillPreempted& evt) {
2554 return discard_event();
2555 }
2556 boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
2557 return discard_event();
2558 }
2559 };
2560
2561 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2562 typedef boost::mpl::list<
2563 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2564 // for compat with old peers
2565 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2566 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2567 boost::statechart::custom_reaction< BackfillTooFull >,
2568 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2569 boost::statechart::custom_reaction< RemoteBackfillPreempted >
2570 > reactions;
2571 explicit RepRecovering(my_context ctx);
2572 boost::statechart::result react(const RemoteRecoveryPreempted &evt);
2573 boost::statechart::result react(const BackfillTooFull &evt);
2574 boost::statechart::result react(const RemoteBackfillPreempted &evt);
2575 void exit();
2576 };
2577
2578 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2579 typedef boost::mpl::list<
2580 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2581 boost::statechart::custom_reaction< RejectRemoteReservation >,
2582 boost::statechart::custom_reaction< RemoteReservationRejected >,
2583 boost::statechart::custom_reaction< RemoteReservationCanceled >
2584 > reactions;
2585 explicit RepWaitBackfillReserved(my_context ctx);
2586 void exit();
2587 boost::statechart::result react(const RemoteBackfillReserved &evt);
2588 boost::statechart::result react(const RejectRemoteReservation &evt);
2589 boost::statechart::result react(const RemoteReservationRejected &evt);
2590 boost::statechart::result react(const RemoteReservationCanceled &evt);
2591 };
2592
2593 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2594 typedef boost::mpl::list<
2595 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2596 // for compat with old peers
2597 boost::statechart::custom_reaction< RemoteReservationRejected >,
2598 boost::statechart::custom_reaction< RemoteReservationCanceled >
2599 > reactions;
2600 explicit RepWaitRecoveryReserved(my_context ctx);
2601 void exit();
2602 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2603 boost::statechart::result react(const RemoteReservationRejected &evt) {
2604 // for compat with old peers
2605 post_event(RemoteReservationCanceled());
2606 return discard_event();
2607 }
2608 boost::statechart::result react(const RemoteReservationCanceled &evt);
2609 };
2610
2611 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2612 typedef boost::mpl::list<
2613 boost::statechart::custom_reaction< RequestRecoveryPrio >,
2614 boost::statechart::custom_reaction< RequestBackfillPrio >,
2615 boost::statechart::custom_reaction< RejectRemoteReservation >,
2616 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2617 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2618 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2619 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2620 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2621 > reactions;
2622 explicit RepNotRecovering(my_context ctx);
2623 boost::statechart::result react(const RequestRecoveryPrio &evt);
2624 boost::statechart::result react(const RequestBackfillPrio &evt);
2625 boost::statechart::result react(const RemoteBackfillReserved &evt) {
2626 // my reservation completion raced with a RELEASE from primary
2627 return discard_event();
2628 }
2629 boost::statechart::result react(const RemoteRecoveryReserved &evt) {
2630 // my reservation completion raced with a RELEASE from primary
2631 return discard_event();
2632 }
2633 boost::statechart::result react(const RejectRemoteReservation &evt);
2634 void exit();
2635 };
2636
2637 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2638 typedef boost::mpl::list <
2639 boost::statechart::custom_reaction< AllReplicasRecovered >,
2640 boost::statechart::custom_reaction< DeferRecovery >,
2641 boost::statechart::custom_reaction< UnfoundRecovery >,
2642 boost::statechart::custom_reaction< RequestBackfill >
2643 > reactions;
2644 explicit Recovering(my_context ctx);
2645 void exit();
2646 void release_reservations(bool cancel = false);
2647 boost::statechart::result react(const AllReplicasRecovered &evt);
2648 boost::statechart::result react(const DeferRecovery& evt);
2649 boost::statechart::result react(const UnfoundRecovery& evt);
2650 boost::statechart::result react(const RequestBackfill &evt);
2651 };
2652
2653 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2654 typedef boost::mpl::list <
2655 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2656 boost::statechart::transition< AllRemotesReserved, Recovering >
2657 > reactions;
2658 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2659 explicit WaitRemoteRecoveryReserved(my_context ctx);
2660 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2661 void exit();
2662 };
2663
2664 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2665 typedef boost::mpl::list <
2666 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2667 boost::statechart::custom_reaction< RecoveryTooFull >
2668 > reactions;
2669 explicit WaitLocalRecoveryReserved(my_context ctx);
2670 void exit();
2671 boost::statechart::result react(const RecoveryTooFull &evt);
2672 };
2673
2674 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2675 typedef boost::mpl::list <
2676 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2677 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2678 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2679 > reactions;
2680 explicit Activating(my_context ctx);
2681 void exit();
2682 };
2683
2684 struct Stray : boost::statechart::state< Stray, Started >,
2685 NamedState {
2686 explicit Stray(my_context ctx);
2687 void exit();
2688
2689 typedef boost::mpl::list <
2690 boost::statechart::custom_reaction< MQuery >,
2691 boost::statechart::custom_reaction< MLogRec >,
2692 boost::statechart::custom_reaction< MInfoRec >,
2693 boost::statechart::custom_reaction< ActMap >,
2694 boost::statechart::custom_reaction< RecoveryDone >,
2695 boost::statechart::transition<DeleteStart, ToDelete>
2696 > reactions;
2697 boost::statechart::result react(const MQuery& query);
2698 boost::statechart::result react(const MLogRec& logevt);
2699 boost::statechart::result react(const MInfoRec& infoevt);
2700 boost::statechart::result react(const ActMap&);
2701 boost::statechart::result react(const RecoveryDone&) {
2702 return discard_event();
2703 }
2704 };
2705
2706 struct WaitDeleteReserved;
2707 struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
2708 unsigned priority = 0;
2709 typedef boost::mpl::list <
2710 boost::statechart::custom_reaction< ActMap >,
2711 boost::statechart::custom_reaction< DeleteSome >
2712 > reactions;
2713 explicit ToDelete(my_context ctx);
2714 boost::statechart::result react(const ActMap &evt);
2715 boost::statechart::result react(const DeleteSome &evt) {
2716 // happens if we drop out of Deleting due to reprioritization etc.
2717 return discard_event();
2718 }
2719 void exit();
2720 };
2721
2722 struct Deleting;
2723 struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
2724 ToDelete>, NamedState {
2725 typedef boost::mpl::list <
2726 boost::statechart::transition<DeleteReserved, Deleting>
2727 > reactions;
2728 explicit WaitDeleteReserved(my_context ctx);
2729 void exit();
2730 };
2731
2732 struct Deleting : boost::statechart::state<Deleting,
2733 ToDelete>, NamedState {
2734 typedef boost::mpl::list <
2735 boost::statechart::custom_reaction< DeleteSome >,
2736 boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
2737 > reactions;
2738 explicit Deleting(my_context ctx);
2739 boost::statechart::result react(const DeleteSome &evt);
2740 void exit();
2741 };
2742
2743 struct GetLog;
2744
2745 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2746 set<pg_shard_t> peer_info_requested;
2747
2748 explicit GetInfo(my_context ctx);
2749 void exit();
2750 void get_infos();
2751
2752 typedef boost::mpl::list <
2753 boost::statechart::custom_reaction< QueryState >,
2754 boost::statechart::transition< GotInfo, GetLog >,
2755 boost::statechart::custom_reaction< MNotifyRec >,
2756 boost::statechart::transition< IsDown, Down >
2757 > reactions;
2758 boost::statechart::result react(const QueryState& q);
2759 boost::statechart::result react(const MNotifyRec& infoevt);
2760 };
2761
2762 struct GotLog : boost::statechart::event< GotLog > {
2763 GotLog() : boost::statechart::event< GotLog >() {}
2764 };
2765
2766 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2767 pg_shard_t auth_log_shard;
2768 boost::intrusive_ptr<MOSDPGLog> msg;
2769
2770 explicit GetLog(my_context ctx);
2771 void exit();
2772
2773 typedef boost::mpl::list <
2774 boost::statechart::custom_reaction< QueryState >,
2775 boost::statechart::custom_reaction< MLogRec >,
2776 boost::statechart::custom_reaction< GotLog >,
2777 boost::statechart::custom_reaction< AdvMap >,
2778 boost::statechart::transition< IsIncomplete, Incomplete >
2779 > reactions;
2780 boost::statechart::result react(const AdvMap&);
2781 boost::statechart::result react(const QueryState& q);
2782 boost::statechart::result react(const MLogRec& logevt);
2783 boost::statechart::result react(const GotLog&);
2784 };
2785
2786 struct WaitUpThru;
2787
2788 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2789 set<pg_shard_t> peer_missing_requested;
2790
2791 explicit GetMissing(my_context ctx);
2792 void exit();
2793
2794 typedef boost::mpl::list <
2795 boost::statechart::custom_reaction< QueryState >,
2796 boost::statechart::custom_reaction< MLogRec >,
2797 boost::statechart::transition< NeedUpThru, WaitUpThru >
2798 > reactions;
2799 boost::statechart::result react(const QueryState& q);
2800 boost::statechart::result react(const MLogRec& logevt);
2801 };
2802
2803 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2804 explicit WaitUpThru(my_context ctx);
2805 void exit();
2806
2807 typedef boost::mpl::list <
2808 boost::statechart::custom_reaction< QueryState >,
2809 boost::statechart::custom_reaction< ActMap >,
2810 boost::statechart::custom_reaction< MLogRec >
2811 > reactions;
2812 boost::statechart::result react(const QueryState& q);
2813 boost::statechart::result react(const ActMap& am);
2814 boost::statechart::result react(const MLogRec& logrec);
2815 };
2816
2817 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2818 explicit Down(my_context ctx);
2819 typedef boost::mpl::list <
2820 boost::statechart::custom_reaction< QueryState >,
2821 boost::statechart::custom_reaction< MNotifyRec >
2822 > reactions;
2823 boost::statechart::result react(const QueryState& q);
2824 boost::statechart::result react(const MNotifyRec& infoevt);
2825 void exit();
2826 };
2827
2828 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2829 typedef boost::mpl::list <
2830 boost::statechart::custom_reaction< AdvMap >,
2831 boost::statechart::custom_reaction< MNotifyRec >,
2832 boost::statechart::custom_reaction< QueryState >
2833 > reactions;
2834 explicit Incomplete(my_context ctx);
2835 boost::statechart::result react(const AdvMap &advmap);
2836 boost::statechart::result react(const MNotifyRec& infoevt);
2837 boost::statechart::result react(const QueryState& q);
2838 void exit();
2839 };
2840
2841 RecoveryMachine machine;
2842 PG *pg;
2843
2844 /// context passed in by state machine caller
2845 RecoveryCtx *orig_ctx;
2846
2847 /// populated if we are buffering messages pending a flush
2848 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2849
2850 /**
2851 * populated between start_handle() and end_handle(), points into
2852 * the message lists for messages_pending_flush while blocking messages
2853 * or into orig_ctx otherwise
2854 */
2855 boost::optional<RecoveryCtx> rctx;
2856
2857 public:
2858 explicit RecoveryState(PG *pg)
2859 : machine(this, pg), pg(pg), orig_ctx(0) {
2860 machine.initiate();
2861 }
2862
2863 void handle_event(const boost::statechart::event_base &evt,
2864 RecoveryCtx *rctx) {
2865 start_handle(rctx);
2866 machine.process_event(evt);
2867 end_handle();
2868 }
2869
2870 void handle_event(PGPeeringEventRef evt,
2871 RecoveryCtx *rctx) {
2872 start_handle(rctx);
2873 machine.process_event(evt->get_event());
2874 end_handle();
2875 }
2876
2877 } recovery_state;
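// Illustrative driver (the create/dispatch helpers live on the OSD side and
// are assumptions here): each event runs under a RecoveryCtx so messages and
// transactions produced by the reactions are batched for the caller:
//
//   RecoveryCtx rctx = osd->create_context();         // assumed helper
//   recovery_state.handle_event(evt, &rctx);          // run the reactions
//   osd->dispatch_context(rctx, this, get_osdmap());  // assumed helper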
2878
2879
2880
2881 uint64_t peer_features;
2882 uint64_t acting_features;
2883 uint64_t upacting_features;
2884
2885 epoch_t last_epoch;
2886
2887 /// most recently consumed osdmap's require_osd_release
2888 unsigned last_require_osd_release = 0;
2889 bool delete_needs_sleep = false;
2890
2891protected:
2892 void reset_min_peer_features() {
2893 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2894 }
2895 uint64_t get_min_peer_features() const { return peer_features; }
2896 void apply_peer_features(uint64_t f) { peer_features &= f; }
2897
2898 uint64_t get_min_acting_features() const { return acting_features; }
2899 uint64_t get_min_upacting_features() const { return upacting_features; }
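// Feature negotiation is a running bitwise AND (illustrative):
//
//   reset_min_peer_features();            // start from all supported bits
//   for (uint64_t f : peer_feature_sets)  // hypothetical collection
//     apply_peer_features(f);             // peer_features &= f
//   uint64_t common = get_min_peer_features();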
2900 bool perform_deletes_during_peering() const {
2901 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2902 }
2903
2904 bool hard_limit_pglog() const {
2905 return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
2906 }
2907
2908 void init_primary_up_acting(
2909 const vector<int> &newup,
2910 const vector<int> &newacting,
2911 int new_up_primary,
2912 int new_acting_primary) {
2913 actingset.clear();
2914 acting = newacting;
2915 for (uint8_t i = 0; i < acting.size(); ++i) {
2916 if (acting[i] != CRUSH_ITEM_NONE)
2917 actingset.insert(
2918 pg_shard_t(
2919 acting[i],
2920 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2921 }
2922 upset.clear();
2923 up = newup;
2924 for (uint8_t i = 0; i < up.size(); ++i) {
2925 if (up[i] != CRUSH_ITEM_NONE)
2926 upset.insert(
2927 pg_shard_t(
2928 up[i],
2929 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2930 }
2931 if (!pool.info.is_erasure()) {
2932 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2933 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2934 return;
2935 }
2936 up_primary = pg_shard_t();
2937 primary = pg_shard_t();
2938 for (uint8_t i = 0; i < up.size(); ++i) {
2939 if (up[i] == new_up_primary) {
2940 up_primary = pg_shard_t(up[i], shard_id_t(i));
2941 break;
2942 }
2943 }
2944 for (uint8_t i = 0; i < acting.size(); ++i) {
2945 if (acting[i] == new_acting_primary) {
2946 primary = pg_shard_t(acting[i], shard_id_t(i));
2947 break;
2948 }
2949 }
2950 ceph_assert(up_primary.osd == new_up_primary);
2951 ceph_assert(primary.osd == new_acting_primary);
2952 }
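// Worked example (illustrative): with newacting = [3, 1, 4] and
// new_acting_primary = 1, an erasure-coded pool yields
// actingset = { 3(shard 0), 1(shard 1), 4(shard 2) } and primary = 1(1),
// since EC shard identity is positional; a replicated pool records the same
// OSDs with shard_id_t::NO_SHARD and position carries no meaning.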
2953
2954 void set_role(int r) {
2955 role = r;
2956 }
2957
2958 bool state_test(uint64_t m) const { return (state & m) != 0; }
2959 void state_set(uint64_t m) { state |= m; }
2960 void state_clear(uint64_t m) { state &= ~m; }
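// The PG state word is a plain bitmask, so the helpers compose
// (illustrative):
//
//   state_set(PG_STATE_ACTIVE | PG_STATE_DEGRADED);
//   state_test(PG_STATE_ACTIVE);     // -> true
//   state_clear(PG_STATE_DEGRADED);  // ACTIVE bit remains set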
2961
2962 bool is_complete() const { return info.last_complete == info.last_update; }
2963 bool should_send_notify() const { return send_notify; }
2964
2965 uint64_t get_state() const { return state; }
2966 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2967 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2968 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2969 bool is_down() const { return state_test(PG_STATE_DOWN); }
2970 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2971 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2972 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2973 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2974 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2975 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2976 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2977 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2978 bool is_peered() const {
2979 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2980 }
2981 bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
2982 bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
2983 bool is_repair() const { return state_test(PG_STATE_REPAIR); }
2984
2985 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2986
2987 // pg on-disk state
2988 void do_pending_flush();
2989
2990public:
2991 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2992 static void _init(ObjectStore::Transaction& t,
2993 spg_t pgid, const pg_pool_t *pool);
2994
2995protected:
2996 void prepare_write_info(map<string,bufferlist> *km);
2997
2998 void update_store_with_options();
2999
3000public:
3001 static int _prepare_write_info(
3002 CephContext* cct,
3003 map<string,bufferlist> *km,
3004 epoch_t epoch,
3005 pg_info_t &info,
3006 pg_info_t &last_written_info,
3007 PastIntervals &past_intervals,
3008 bool dirty_big_info,
3009 bool dirty_epoch,
3010 bool try_fast_info,
3011 PerfCounters *logger = nullptr);
3012
3013 void write_if_dirty(RecoveryCtx *rctx) {
3014 write_if_dirty(*rctx->transaction);
3015 }
3016protected:
3017 void write_if_dirty(ObjectStore::Transaction& t);
3018
3019 PGLog::IndexedLog projected_log;
3020 bool check_in_progress_op(
3021 const osd_reqid_t &r,
3022 eversion_t *version,
3023 version_t *user_version,
3024 int *return_code) const;
3025 eversion_t projected_last_update;
3026 eversion_t get_next_version() const {
3027 eversion_t at_version(
3028 get_osdmap_epoch(),
3029 projected_last_update.version+1);
3030 ceph_assert(at_version > info.last_update);
3031 ceph_assert(at_version > pg_log.get_head());
3032 ceph_assert(at_version > projected_last_update);
3033 return at_version;
3034 }
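// Illustrative: with the current osdmap at epoch 120 and
// projected_last_update at (118, 42), the next entry is stamped
// eversion_t(120, 43): the epoch follows the map in force, the version
// increments monotonically within the PG.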
3035
3036 void add_log_entry(const pg_log_entry_t& e, bool applied);
3037 void append_log(
3038 const vector<pg_log_entry_t>& logv,
3039 eversion_t trim_to,
3040 eversion_t roll_forward_to,
3041 ObjectStore::Transaction &t,
3042 bool transaction_applied = true,
3043 bool async = false);
3044 bool check_log_for_corruption(ObjectStore *store);
3045
3046 std::string get_corrupt_pg_log_name() const;
3047
3048 void update_snap_map(
3049 const vector<pg_log_entry_t> &log_entries,
3050 ObjectStore::Transaction& t);
3051
3052 void filter_snapc(vector<snapid_t> &snaps);
3053
3054 void log_weirdness();
3055
3056 virtual void kick_snap_trim() = 0;
3057 virtual void snap_trimmer_scrub_complete() = 0;
3058 bool requeue_scrub(bool high_priority = false);
3059 void queue_recovery();
3060 bool queue_scrub();
3061 unsigned get_scrub_priority();
3062
3063 /// share pg info after a pg is active
3064 void share_pg_info();
3065
3066
3067 bool append_log_entries_update_missing(
3068 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3069 ObjectStore::Transaction &t,
3070 boost::optional<eversion_t> trim_to,
3071 boost::optional<eversion_t> roll_forward_to);
3072
3073 /**
3074 * Merge entries updating missing as necessary on all
3075 * acting_recovery_backfill logs and missings (also missing_loc)
3076 */
3077 void merge_new_log_entries(
3078 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3079 ObjectStore::Transaction &t,
3080 boost::optional<eversion_t> trim_to,
3081 boost::optional<eversion_t> roll_forward_to);
3082
3083 void reset_interval_flush();
3084 void start_peering_interval(
3085 const OSDMapRef lastmap,
3086 const vector<int>& newup, int up_primary,
3087 const vector<int>& newacting, int acting_primary,
3088 ObjectStore::Transaction *t);
3089 void on_new_interval();
3090 virtual void _on_new_interval() = 0;
3091 void start_flush(ObjectStore::Transaction *t);
3092 void set_last_peering_reset();
3093
3094 void update_history(const pg_history_t& history);
3095 void fulfill_info(pg_shard_t from, const pg_query_t &query,
3096 pair<pg_shard_t, pg_info_t> &notify_info);
3097 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
3098 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
3099 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
3100
3101 bool should_restart_peering(
3102 int newupprimary,
3103 int newactingprimary,
3104 const vector<int>& newup,
3105 const vector<int>& newacting,
3106 OSDMapRef lastmap,
3107 OSDMapRef osdmap);
3108
3109 // OpRequest queueing
3110 bool can_discard_op(OpRequestRef& op);
3111 bool can_discard_scan(OpRequestRef op);
3112 bool can_discard_backfill(OpRequestRef op);
3113 bool can_discard_request(OpRequestRef& op);
3114
3115 template<typename T, int MSGTYPE>
3116 bool can_discard_replica_op(OpRequestRef& op);
3117
3118 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
3119 bool old_peering_evt(PGPeeringEventRef evt) {
3120 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
3121 }
3122 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
3123 return e <= cur_epoch;
3124 }
3125 bool have_same_or_newer_map(epoch_t e) {
3126 return e <= get_osdmap_epoch();
3127 }
3128
3129 bool op_has_sufficient_caps(OpRequestRef& op);
3130
3131
3132 // recovery bits
3133 void take_waiters();
3134
3135
3136 // abstract bits
3137 friend class FlushState;
3138
3139 virtual void on_role_change() = 0;
3140 virtual void on_pool_change() = 0;
3141 virtual void on_change(ObjectStore::Transaction *t) = 0;
3142 virtual void on_activate() = 0;
3143 virtual void on_flushed() = 0;
3144 virtual void check_blacklisted_watchers() = 0;
3145
3146 friend ostream& operator<<(ostream& out, const PG& pg);
3147};
3148
3149
3150ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
3151
3152#endif