1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef CEPH_PG_H
16#define CEPH_PG_H
17
18#include <boost/statechart/custom_reaction.hpp>
19#include <boost/statechart/event.hpp>
20#include <boost/statechart/simple_state.hpp>
21#include <boost/statechart/state.hpp>
22#include <boost/statechart/state_machine.hpp>
23#include <boost/statechart/transition.hpp>
24#include <boost/statechart/event_base.hpp>
25#include <boost/scoped_ptr.hpp>
26#include <boost/container/flat_set.hpp>
27#include "include/mempool.h"
28
29// re-include our assert to clobber boost's
30#include "include/ceph_assert.h"
31#include "include/common_fwd.h"
32
33#include "include/types.h"
34#include "include/stringify.h"
35#include "osd_types.h"
36#include "include/xlist.h"
37#include "SnapMapper.h"
38#include "Session.h"
39#include "common/Timer.h"
40
41#include "PGLog.h"
42#include "OSDMap.h"
43#include "messages/MOSDPGLog.h"
44#include "include/str_list.h"
45#include "PGBackend.h"
46#include "PGPeeringEvent.h"
47#include "PeeringState.h"
48#include "MissingLoc.h"
49
50#include "mgr/OSDPerfMetricTypes.h"
51
52#include <atomic>
53#include <list>
54#include <memory>
55#include <string>
56#include <tuple>
57
58//#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
59//#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
60
61class OSD;
62class OSDService;
63class OSDShard;
64class OSDShardPGSlot;
65class MOSDOp;
66class MOSDPGScan;
67class MOSDPGBackfill;
68class MOSDPGInfo;
69
70class PG;
71struct OpRequest;
72typedef OpRequest::Ref OpRequestRef;
73class MOSDPGLog;
74class DynamicPerfStats;
75
76namespace Scrub {
77 class Store;
78}
79
80#ifdef PG_DEBUG_REFS
81#include "common/tracked_int_ptr.hpp"
82 uint64_t get_with_id(PG *pg);
83 void put_with_id(PG *pg, uint64_t id);
84 typedef TrackedIntPtr<PG> PGRef;
85#else
86 typedef boost::intrusive_ptr<PG> PGRef;
87#endif
88
89class PGRecoveryStats {
90 struct per_state_info {
91 uint64_t enter, exit; // enter/exit counts
92 uint64_t events;
93 utime_t event_time; // time spent processing events
94 utime_t total_time; // total time in state
95 utime_t min_time, max_time;
96
97 // cppcheck-suppress unreachableCode
98 per_state_info() : enter(0), exit(0), events(0) {}
99 };
100 map<const char *,per_state_info> info;
101 ceph::mutex lock = ceph::make_mutex("PGRecoveryStats::lock");
102
103 public:
104 PGRecoveryStats() = default;
105
106 void reset() {
107 std::lock_guard l(lock);
108 info.clear();
109 }
110 void dump(ostream& out) {
111 std::lock_guard l(lock);
112 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
113 per_state_info& i = p->second;
114 out << i.enter << "\t" << i.exit << "\t"
115 << i.events << "\t" << i.event_time << "\t"
116 << i.total_time << "\t"
117 << i.min_time << "\t" << i.max_time << "\t"
118 << p->first << "\n";
119 }
120 }
121
122 void dump_formatted(Formatter *f) {
123 std::lock_guard l(lock);
124 f->open_array_section("pg_recovery_stats");
125 for (map<const char *,per_state_info>::iterator p = info.begin();
126 p != info.end(); ++p) {
127 per_state_info& i = p->second;
128 f->open_object_section("recovery_state");
129 f->dump_int("enter", i.enter);
130 f->dump_int("exit", i.exit);
131 f->dump_int("events", i.events);
132 f->dump_stream("event_time") << i.event_time;
133 f->dump_stream("total_time") << i.total_time;
134 f->dump_stream("min_time") << i.min_time;
135 f->dump_stream("max_time") << i.max_time;
136 vector<string> states;
137 get_str_vec(p->first, "/", states);
138 f->open_array_section("nested_states");
139 for (vector<string>::iterator st = states.begin();
140 st != states.end(); ++st) {
141 f->dump_string("state", *st);
142 }
143 f->close_section();
144 f->close_section();
145 }
146 f->close_section();
147 }
148
149 void log_enter(const char *s) {
150 std::lock_guard l(lock);
151 info[s].enter++;
152 }
153 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
154 std::lock_guard l(lock);
155 per_state_info &i = info[s];
156 i.exit++;
157 i.total_time += dur;
158 if (dur > i.max_time)
159 i.max_time = dur;
160 if (dur < i.min_time || i.min_time == utime_t())
161 i.min_time = dur;
162 i.events += events;
163 i.event_time += event_dur;
164 }
165};
166
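// Illustrative usage sketch (not part of this header's interface): callers
// are expected to bracket time spent in a peering/recovery state with
// log_enter()/log_exit(), roughly like
//
//   PGRecoveryStats stats;
//   stats.log_enter("Started/Primary/Active");
//   // ... process events for this state, accumulating event time ...
//   stats.log_exit("Started/Primary/Active", time_in_state, n_events, event_time);
//   stats.dump_formatted(f);   // emits the "pg_recovery_stats" array
//
// In this file the state machine drives these counters through
// PG::log_state_enter() and PG::log_state_exit() declared below.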
167/** PG - Replica Placement Group
168 *
169 */
170
171class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
172 friend class NamedState;
173 friend class PeeringState;
174
175public:
176 const pg_shard_t pg_whoami;
177 const spg_t pg_id;
178
179public:
180 // -- members --
181 const coll_t coll;
182
183 ObjectStore::CollectionHandle ch;
184
185 // -- methods --
186 std::ostream& gen_prefix(std::ostream& out) const override;
187 CephContext *get_cct() const override {
188 return cct;
189 }
190 unsigned get_subsys() const override {
191 return ceph_subsys_osd;
192 }
193
194 const char* const get_current_state() const {
195 return recovery_state.get_current_state();
196 }
197
198 const OSDMapRef& get_osdmap() const {
199 ceph_assert(is_locked());
200 return recovery_state.get_osdmap();
201 }
202
203 epoch_t get_osdmap_epoch() const override final {
204 return recovery_state.get_osdmap()->get_epoch();
205 }
206
207 PerfCounters &get_peering_perf() override;
208 PerfCounters &get_perf_logger() override;
209 void log_state_enter(const char *state) override;
210 void log_state_exit(
211 const char *state_name, utime_t enter_time,
212 uint64_t events, utime_t event_dur) override;
213
214 void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
215 handle.suspend_tp_timeout();
216 lock();
217 handle.reset_tp_timeout();
218 }
219 void lock(bool no_lockdep = false) const;
220 void unlock() const;
221 bool is_locked() const;
222
223 const spg_t& get_pgid() const {
224 return pg_id;
225 }
226
227 const PGPool& get_pool() const {
228 return pool;
229 }
230 uint64_t get_last_user_version() const {
231 return info.last_user_version;
232 }
233 const pg_history_t& get_history() const {
234 return info.history;
235 }
236 bool get_need_up_thru() const {
237 return recovery_state.get_need_up_thru();
238 }
239 epoch_t get_same_interval_since() const {
240 return info.history.same_interval_since;
241 }
242
243 static void set_last_scrub_stamp(
244 utime_t t, pg_history_t &history, pg_stat_t &stats) {
245 stats.last_scrub_stamp = t;
246 history.last_scrub_stamp = t;
247 }
248
249 void set_last_scrub_stamp(utime_t t) {
250 recovery_state.update_stats(
251 [=](auto &history, auto &stats) {
252 set_last_scrub_stamp(t, history, stats);
253 return true;
254 });
255 }
256
257 static void set_last_deep_scrub_stamp(
258 utime_t t, pg_history_t &history, pg_stat_t &stats) {
259 stats.last_deep_scrub_stamp = t;
260 history.last_deep_scrub_stamp = t;
261 }
262
263 void set_last_deep_scrub_stamp(utime_t t) {
264 recovery_state.update_stats(
265 [=](auto &history, auto &stats) {
266 set_last_deep_scrub_stamp(t, history, stats);
267 return true;
268 });
269 }
270
271 bool is_deleting() const {
272 return recovery_state.is_deleting();
273 }
274 bool is_deleted() const {
275 return recovery_state.is_deleted();
276 }
277 bool is_nonprimary() const {
278 return recovery_state.is_nonprimary();
279 }
280 bool is_primary() const {
281 return recovery_state.is_primary();
282 }
283 bool pg_has_reset_since(epoch_t e) {
284 ceph_assert(is_locked());
285 return recovery_state.pg_has_reset_since(e);
286 }
287
288 bool is_ec_pg() const {
289 return recovery_state.is_ec_pg();
290 }
291 int get_role() const {
292 return recovery_state.get_role();
293 }
294 const vector<int> get_acting() const {
295 return recovery_state.get_acting();
296 }
297 const set<pg_shard_t> &get_actingset() const {
298 return recovery_state.get_actingset();
299 }
300 int get_acting_primary() const {
301 return recovery_state.get_acting_primary();
302 }
303 pg_shard_t get_primary() const {
304 return recovery_state.get_primary();
305 }
306 const vector<int> get_up() const {
307 return recovery_state.get_up();
308 }
309 int get_up_primary() const {
310 return recovery_state.get_up_primary();
311 }
312 const PastIntervals& get_past_intervals() const {
313 return recovery_state.get_past_intervals();
314 }
315 bool is_acting_recovery_backfill(pg_shard_t osd) const {
316 return recovery_state.is_acting_recovery_backfill(osd);
317 }
318 const set<pg_shard_t> &get_acting_recovery_backfill() const {
319 return recovery_state.get_acting_recovery_backfill();
320 }
321 bool is_acting(pg_shard_t osd) const {
322 return recovery_state.is_acting(osd);
323 }
324 bool is_up(pg_shard_t osd) const {
325 return recovery_state.is_up(osd);
326 }
327 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
328 return PeeringState::has_shard(ec, v, osd);
329 }
330
331 /// initialize created PG
332 void init(
333 int role,
334 const vector<int>& up,
335 int up_primary,
336 const vector<int>& acting,
337 int acting_primary,
338 const pg_history_t& history,
339 const PastIntervals& pim,
340 bool backfill,
341 ObjectStore::Transaction &t);
342
343 /// read existing pg state off disk
344 void read_state(ObjectStore *store);
345 static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
346
347 static int get_latest_struct_v() {
348 return pg_latest_struct_v;
349 }
350 static int get_compat_struct_v() {
351 return pg_compat_struct_v;
352 }
353 static int read_info(
354 ObjectStore *store, spg_t pgid, const coll_t &coll,
355 pg_info_t &info, PastIntervals &past_intervals,
356 __u8 &);
357 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
358
359 void rm_backoff(const ceph::ref_t<Backoff>& b);
360
361 void update_snap_mapper_bits(uint32_t bits) {
362 snap_mapper.update_bits(bits);
363 }
364 void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
365 virtual void split_colls(
366 spg_t child,
367 int split_bits,
368 int seed,
369 const pg_pool_t *pool,
370 ObjectStore::Transaction &t) = 0;
371 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
372 void merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx,
373 unsigned split_bits,
374 const pg_merge_meta_t& last_pg_merge_meta);
375 void finish_split_stats(const object_stat_sum_t& stats,
376 ObjectStore::Transaction &t);
377
378 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
379
380 bool is_scrub_registered();
381 void reg_next_scrub();
382 void unreg_next_scrub();
383
384 void queue_want_pg_temp(const vector<int> &wanted) override;
385 void clear_want_pg_temp() override;
386
387 void on_new_interval() override;
388
389 void on_role_change() override;
390 virtual void plpg_on_role_change() = 0;
391
392 void init_collection_pool_opts();
393 void on_pool_change() override;
394 virtual void plpg_on_pool_change() = 0;
395
396 void on_info_history_change() override;
397
398 void scrub_requested(bool deep, bool repair, bool need_auto = false) override;
399
400 uint64_t get_snap_trimq_size() const override {
401 return snap_trimq.size();
402 }
403 unsigned get_target_pg_log_entries() const override;
404
405 void clear_publish_stats() override;
406 void clear_primary_state() override;
407
408 epoch_t oldest_stored_osdmap() override;
409 OstreamTemp get_clog_error() override;
410 OstreamTemp get_clog_info() override;
411 OstreamTemp get_clog_debug() override;
412
413 void schedule_event_after(
414 PGPeeringEventRef event,
415 float delay) override;
416 void request_local_background_io_reservation(
417 unsigned priority,
418 PGPeeringEventRef on_grant,
419 PGPeeringEventRef on_preempt) override;
420 void update_local_background_io_priority(
421 unsigned priority) override;
422 void cancel_local_background_io_reservation() override;
423
424 void request_remote_recovery_reservation(
425 unsigned priority,
426 PGPeeringEventRef on_grant,
427 PGPeeringEventRef on_preempt) override;
428 void cancel_remote_recovery_reservation() override;
429
430 void schedule_event_on_commit(
431 ObjectStore::Transaction &t,
432 PGPeeringEventRef on_commit) override;
433
434 void on_active_exit() override;
435
436 Context *on_clean() override {
437 if (is_active()) {
438 kick_snap_trim();
439 }
440 requeue_ops(waiting_for_clean_to_primary_repair);
441 return finish_recovery();
442 }
443
444 void on_activate(interval_set<snapid_t> snaps) override {
445 ceph_assert(scrubber.callbacks.empty());
446 ceph_assert(callbacks_for_degraded_object.empty());
447 snap_trimq = snaps;
448 release_pg_backoffs();
449 projected_last_update = info.last_update;
450 }
451
452 void on_activate_committed() override;
453
454 void on_active_actmap() override;
455 void on_active_advmap(const OSDMapRef &osdmap) override;
456
457 void queue_snap_retrim(snapid_t snap);
458
459 void on_backfill_reserved() override;
460 void on_backfill_canceled() override;
461 void on_recovery_reserved() override;
462
463 bool is_forced_recovery_or_backfill() const {
464 return recovery_state.is_forced_recovery_or_backfill();
465 }
466
467 PGLog::LogEntryHandlerRef get_log_handler(
468 ObjectStore::Transaction &t) override {
469 return std::make_unique<PG::PGLogEntryHandler>(this, &t);
470 }
471
472 void do_delete_work(ObjectStore::Transaction &t) override;
473
474 void clear_ready_to_merge() override;
475 void set_not_ready_to_merge_target(pg_t pgid, pg_t src) override;
476 void set_not_ready_to_merge_source(pg_t pgid) override;
477 void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) override;
478 void set_ready_to_merge_source(eversion_t lu) override;
479
480 void send_pg_created(pg_t pgid) override;
481
482 ceph::signedspan get_mnow() override;
483 HeartbeatStampsRef get_hb_stamps(int peer) override;
484 void schedule_renew_lease(epoch_t lpr, ceph::timespan delay) override;
485 void queue_check_readable(epoch_t lpr, ceph::timespan delay) override;
486
487 void rebuild_missing_set_with_deletes(PGLog &pglog) override;
488
489 void queue_peering_event(PGPeeringEventRef evt);
490 void do_peering_event(PGPeeringEventRef evt, PeeringCtx &rcx);
491 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
492 void queue_flushed(epoch_t started_at);
493 void handle_advance_map(
494 OSDMapRef osdmap, OSDMapRef lastmap,
495 vector<int>& newup, int up_primary,
496 vector<int>& newacting, int acting_primary,
497 PeeringCtx &rctx);
498 void handle_activate_map(PeeringCtx &rctx);
499 void handle_initialize(PeeringCtx &rxcx);
500 void handle_query_state(Formatter *f);
501
502 /**
503 * @param ops_begun returns how many recovery ops the function started
504 * @returns true if any useful work was accomplished; false otherwise
505 */
506 virtual bool start_recovery_ops(
507 uint64_t max,
508 ThreadPool::TPHandle &handle,
509 uint64_t *ops_begun) = 0;
510
511 // more work after the above, but with a PeeringCtx
512 void find_unfound(epoch_t queued, PeeringCtx &rctx);
513
514 virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
515
516 void dump_pgstate_history(Formatter *f);
517 void dump_missing(Formatter *f);
518
519 void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
520 void with_heartbeat_peers(std::function<void(int)> f);
521
522 void shutdown();
523 virtual void on_shutdown() = 0;
524
525 bool get_must_scrub() const {
526 return scrubber.must_scrub;
527 }
528 bool sched_scrub();
529
530 virtual void do_request(
531 OpRequestRef& op,
532 ThreadPool::TPHandle &handle
533 ) = 0;
534 virtual void clear_cache() = 0;
535 virtual int get_cache_obj_count() = 0;
536
537 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
538 virtual void do_command(
539 const string_view& prefix,
540 const cmdmap_t& cmdmap,
541 const bufferlist& idata,
542 std::function<void(int,const std::string&,bufferlist&)> on_finish) = 0;
543
544 virtual bool agent_work(int max) = 0;
545 virtual bool agent_work(int max, int agent_flush_quota) = 0;
546 virtual void agent_stop() = 0;
547 virtual void agent_delay() = 0;
548 virtual void agent_clear() = 0;
549 virtual void agent_choose_mode_restart() = 0;
550
551 struct C_DeleteMore : public Context {
552 PGRef pg;
553 epoch_t epoch;
554 C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
555 void finish(int r) override {
556 ceph_abort();
557 }
558 void complete(int r) override;
559 };
560
561 void _delete_some(ObjectStore::Transaction *t);
562
563 virtual void set_dynamic_perf_stats_queries(
564 const std::list<OSDPerfMetricQuery> &queries) {
565 }
566 virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
567 }
568
569 uint64_t get_min_alloc_size() const;
570
571 // reference counting
572#ifdef PG_DEBUG_REFS
573 uint64_t get_with_id();
574 void put_with_id(uint64_t);
575 void dump_live_ids();
576#endif
577 void get(const char* tag);
578 void put(const char* tag);
579 int get_num_ref() {
580 return ref;
581 }
582
583 // ctor
584 PG(OSDService *o, OSDMapRef curmap,
585 const PGPool &pool, spg_t p);
586 ~PG() override;
587
588 // prevent copying
589 explicit PG(const PG& rhs) = delete;
590 PG& operator=(const PG& rhs) = delete;
591
592protected:
593 // -------------
594 // protected
595 OSDService *osd;
596public:
597 OSDShard *osd_shard = nullptr;
598 OSDShardPGSlot *pg_slot = nullptr;
599protected:
600 CephContext *cct;
601
602 // locking and reference counting.
603 // I destroy myself when the reference count hits zero.
604 // lock() should be called before doing anything.
605 // get() should be called on pointer copy (to another thread, etc.).
606 // put() should be called on destruction of some previously copied pointer.
607 // unlock() when done with the current pointer (_most common_).
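  // Illustrative sketch of the protocol described above (assumed caller code,
  // not an API defined here):
  //
  //   pg->get("some-tag");    // take a ref when the pointer is copied
  //   pg->lock();
  //   // ... work on the PG ...
  //   pg->unlock();
  //   pg->put("some-tag");    // drop the ref; the PG frees itself at zero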
608 mutable ceph::mutex _lock = ceph::make_mutex("PG::_lock");
609#ifndef CEPH_DEBUG_MUTEX
610 mutable std::thread::id locked_by;
611#endif
612 std::atomic<unsigned int> ref{0};
613
614#ifdef PG_DEBUG_REFS
615 ceph::mutex _ref_id_lock = ceph::make_mutex("PG::_ref_id_lock");
616 map<uint64_t, string> _live_ids;
617 map<string, uint64_t> _tag_counts;
618 uint64_t _ref_id = 0;
619
620 friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
621 friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
622#endif
623
624private:
625 friend void intrusive_ptr_add_ref(PG *pg) {
626 pg->get("intptr");
627 }
628 friend void intrusive_ptr_release(PG *pg) {
629 pg->put("intptr");
630 }
631
632
633 // =====================
634
635protected:
636 OSDriver osdriver;
637 SnapMapper snap_mapper;
638 bool eio_errors_to_process = false;
639
640 virtual PGBackend *get_pgbackend() = 0;
641 virtual const PGBackend* get_pgbackend() const = 0;
642
643protected:
644 void requeue_map_waiters();
645
646protected:
647
648 ZTracer::Endpoint trace_endpoint;
649
650
651protected:
652 __u8 info_struct_v = 0;
653 void upgrade(ObjectStore *store);
654
655protected:
656 ghobject_t pgmeta_oid;
657
658 // ------------------
659 interval_set<snapid_t> snap_trimq;
660 set<snapid_t> snap_trimq_repeat;
661
662 /* You should not use these items without taking their respective queue locks
663 * (if they have one) */
664 xlist<PG*>::item stat_queue_item;
665 bool scrub_queued;
666 bool recovery_queued;
667
668 int recovery_ops_active;
669 set<pg_shard_t> waiting_on_backfill;
670#ifdef DEBUG_RECOVERY_OIDS
671 multiset<hobject_t> recovering_oids;
672#endif
673
674public:
675 bool dne() { return info.dne(); }
676
677 virtual void send_cluster_message(
678 int osd, Message *m, epoch_t epoch, bool share_map_update) override;
679
680protected:
681 epoch_t get_last_peering_reset() const {
682 return recovery_state.get_last_peering_reset();
683 }
684
685 /* heartbeat peers */
686 void set_probe_targets(const set<pg_shard_t> &probe_set) override;
687 void clear_probe_targets() override;
688
689 ceph::mutex heartbeat_peer_lock =
690 ceph::make_mutex("PG::heartbeat_peer_lock");
691 set<int> heartbeat_peers;
692 set<int> probe_targets;
693
694public:
695 /**
696 * BackfillInterval
697 *
698 * Represents the objects in a range [begin, end)
699 *
700 * Possible states:
701 * 1) begin == end == hobject_t() indicates that the interval is unpopulated
702 * 2) Else, objects contains all objects in [begin, end)
703 */
704 struct BackfillInterval {
705 // info about a backfill interval on a peer
706 eversion_t version; /// version at which the scan occurred
707 map<hobject_t,eversion_t> objects;
708 hobject_t begin;
709 hobject_t end;
710
711 /// clear content
712 void clear() {
713 *this = BackfillInterval();
714 }
715
716 /// clear objects list only
717 void clear_objects() {
718 objects.clear();
719 }
720
721 /// reinstantiate with a new start+end position and sort order
722 void reset(hobject_t start) {
723 clear();
724 begin = end = start;
725 }
726
727 /// true if there are no objects in this interval
728 bool empty() const {
729 return objects.empty();
730 }
731
732 /// true if interval extends to the end of the range
733 bool extends_to_end() const {
734 return end.is_max();
735 }
736
737 /// removes items <= soid and adjusts begin to the first object
738 void trim_to(const hobject_t &soid) {
739 trim();
740 while (!objects.empty() &&
741 objects.begin()->first <= soid) {
742 pop_front();
743 }
744 }
745
746 /// Adjusts begin to the first object
747 void trim() {
748 if (!objects.empty())
749 begin = objects.begin()->first;
750 else
751 begin = end;
752 }
753
754 /// drop first entry, and adjust @begin accordingly
755 void pop_front() {
756 ceph_assert(!objects.empty());
757 objects.erase(objects.begin());
758 trim();
759 }
760
761 /// dump
762 void dump(Formatter *f) const {
763 f->dump_stream("begin") << begin;
764 f->dump_stream("end") << end;
765 f->open_array_section("objects");
766 for (map<hobject_t, eversion_t>::const_iterator i =
767 objects.begin();
768 i != objects.end();
769 ++i) {
770 f->open_object_section("object");
771 f->dump_stream("object") << i->first;
772 f->dump_stream("version") << i->second;
773 f->close_section();
774 }
775 f->close_section();
776 }
777 };
778
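  // Illustrative sketch of how a BackfillInterval is meant to be driven
  // (hypothetical variable names, not code from this file):
  //
  //   BackfillInterval bi;
  //   bi.reset(scan_start);          // begin == end == scan_start
  //   // ... a scan fills bi.objects and advances bi.end ...
  //   bi.trim_to(last_completed);    // drop entries <= last_completed
  //   if (bi.empty() && bi.extends_to_end()) {
  //     // nothing left to backfill in this peer's range
  //   }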
779protected:
780 BackfillInterval backfill_info;
781 map<pg_shard_t, BackfillInterval> peer_backfill_info;
782 bool backfill_reserving;
783
784 // The primary's num_bytes and local num_bytes for this pg, only valid
785 // during backfill for non-primary shards.
786 // Both of these are adjusted for EC to reflect the on-disk bytes
787 std::atomic<int64_t> primary_num_bytes = 0;
788 std::atomic<int64_t> local_num_bytes = 0;
789
790public:
791 // Space reserved for backfill is primary_num_bytes - local_num_bytes
792 // Don't care that difference itself isn't atomic
793 uint64_t get_reserved_num_bytes() {
794 int64_t primary = primary_num_bytes.load();
795 int64_t local = local_num_bytes.load();
796 if (primary > local)
797 return primary - local;
798 else
799 return 0;
800 }
801
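  // Worked example with hypothetical numbers: if the primary reports
  // primary_num_bytes = 10 GiB for this PG and this shard currently has
  // local_num_bytes = 4 GiB, get_reserved_num_bytes() returns 6 GiB; once the
  // local figure catches up to (or exceeds) the primary's, it returns 0.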
802 bool is_remote_backfilling() {
803 return primary_num_bytes.load() > 0;
804 }
805
806 bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
807 void unreserve_recovery_space() override;
808
809 // If the num_bytes values are inconsistent and local_num_bytes goes negative,
810 // it's ok, because it would then be ignored.
811
812 // The value of num_bytes could be negative,
813 // but we don't let local_num_bytes go negative.
814 void add_local_num_bytes(int64_t num_bytes) {
815 if (num_bytes) {
816 int64_t prev_bytes = local_num_bytes.load();
817 int64_t new_bytes;
818 do {
819 new_bytes = prev_bytes + num_bytes;
820 if (new_bytes < 0)
821 new_bytes = 0;
822 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
823 }
824 }
825 void sub_local_num_bytes(int64_t num_bytes) {
826 ceph_assert(num_bytes >= 0);
827 if (num_bytes) {
828 int64_t prev_bytes = local_num_bytes.load();
829 int64_t new_bytes;
830 do {
831 new_bytes = prev_bytes - num_bytes;
832 if (new_bytes < 0)
833 new_bytes = 0;
834 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
835 }
836 }
837 // The value of num_bytes could be negative,
838 // but we don't let info.stats.stats.sum.num_bytes go negative.
839 void add_num_bytes(int64_t num_bytes) {
840 ceph_assert(ceph_mutex_is_locked_by_me(_lock));
841 if (num_bytes) {
842 recovery_state.update_stats(
843 [num_bytes](auto &history, auto &stats) {
844 stats.stats.sum.num_bytes += num_bytes;
845 if (stats.stats.sum.num_bytes < 0) {
846 stats.stats.sum.num_bytes = 0;
847 }
848 return false;
849 });
850 }
851 }
852 void sub_num_bytes(int64_t num_bytes) {
853 ceph_assert(ceph_mutex_is_locked_by_me(_lock));
854 ceph_assert(num_bytes >= 0);
855 if (num_bytes) {
856 recovery_state.update_stats(
857 [num_bytes](auto &history, auto &stats) {
858 stats.stats.sum.num_bytes -= num_bytes;
859 if (stats.stats.sum.num_bytes < 0) {
860 stats.stats.sum.num_bytes = 0;
861 }
862 return false;
863 });
864 }
865 }
866
867 // Only used in testing so not worried about needing the PG lock here
868 int64_t get_stats_num_bytes() {
869 std::lock_guard l{_lock};
870 int num_bytes = info.stats.stats.sum.num_bytes;
871 if (pool.info.is_erasure()) {
872 num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
873 // Round up each object by a stripe
874 num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
875 }
876 int64_t lnb = local_num_bytes.load();
877 if (lnb && lnb != num_bytes) {
878 lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
879 << lnb << " vs stats "
880 << info.stats.stats.sum.num_bytes << " / chunk "
881 << get_pgbackend()->get_ec_data_chunk_count()
882 << dendl;
883 }
884 return num_bytes;
885 }
886
887protected:
888
889 /*
890 * blocked request wait hierarchy
891 *
892 * In order to preserve request ordering we need to be careful about the
893 * order in which blocked requests get requeued. Generally speaking, we
894 * push the requests back up to the op_wq in reverse order (most recent
895 * request first) so that they come back out again in the original order.
896 * However, because there are multiple wait queues, we need to requeue
897 * waitlists in order. Generally speaking, we requeue the wait lists
898 * that are checked first.
899 *
900 * Here are the various wait lists, in the order they are used during
901 * request processing, with notes:
902 *
903 * - waiting_for_map
904 * - may start or stop blocking at any time (depending on client epoch)
905 * - waiting_for_peered
906 * - !is_peered()
907 * - only starts blocking on interval change; never restarts
908 * - waiting_for_flush
909 * - flushes_in_progress
910 * - waiting for final flush during activate
911 * - waiting_for_active
912 * - !is_active()
913 * - only starts blocking on interval change; never restarts
914 * - waiting_for_readable
915 * - now > readable_until
916 * - unblocks when we get fresh(er) osd_pings
917 * - waiting_for_scrub
918 * - starts and stops blocking for varying intervals during scrub
919 * - waiting_for_unreadable_object
920 * - never restarts once object is readable (* except for EIO?)
921 * - waiting_for_degraded_object
922 * - never restarts once object is writeable (* except for EIO?)
923 * - waiting_for_blocked_object
924 * - starts and stops based on proxied op activity
925 * - obc rwlocks
926 * - starts and stops based on read/write activity
927 *
928 * Notes:
929 *
930 * 1. During an interval change, we requeue *everything* in the above order.
931 *
932 * 2. When an obc rwlock is released, we check for a scrub block and requeue
933 * the op there if it applies. We ignore the unreadable/degraded/blocked
934 * queues because we assume they cannot apply at that time (this is
935 * probably mostly true).
936 *
937 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
938 * it is non-empty.
939 *
940 * These three behaviors are generally sufficient to maintain ordering, with
941 * the possible exception of cases where we make an object degraded or
942 * unreadable that was previously okay, e.g. when scrub or op processing
943 * encounters an unexpected error. FIXME.
944 */
945
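 // Illustrative sketch only (assumed shape of the requeue step, not the
 // actual interval-change code): each wait list is handed back to the op
 // queue in the order documented above, e.g.
 //
 //   requeue_ops(waiting_for_peered);   // requeue_ops() pushes back
 //   requeue_ops(waiting_for_active);   // most-recent-first, so clients
 //   ...                                // come out in their original order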
946 // ops with newer maps than our (or blocked behind them)
947 // track these by client, since inter-request ordering doesn't otherwise
948 // matter.
949 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
950
951 // ops waiting on peered
952 list<OpRequestRef> waiting_for_peered;
953
954 /// ops waiting on readable
955 list<OpRequestRef> waiting_for_readable;
956
957 // ops waiting on active (require peered as well)
958 list<OpRequestRef> waiting_for_active;
959 list<OpRequestRef> waiting_for_flush;
960 list<OpRequestRef> waiting_for_scrub;
961
962 list<OpRequestRef> waiting_for_cache_not_full;
963 list<OpRequestRef> waiting_for_clean_to_primary_repair;
964 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
965 waiting_for_degraded_object,
966 waiting_for_blocked_object;
967
968 set<hobject_t> objects_blocked_on_cache_full;
969 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
970 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
971
972 // Callbacks should assume pg (and nothing else) is locked
973 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
974
975 map<eversion_t,
976 list<
977 tuple<OpRequestRef, version_t, int,
978 vector<pg_log_op_return_item_t>>>> waiting_for_ondisk;
979
980 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
981 void requeue_op(OpRequestRef op);
982 void requeue_ops(list<OpRequestRef> &l);
983
984 // stats that persist lazily
985 object_stat_collection_t unstable_stats;
986
987 // publish stats
988 ceph::mutex pg_stats_publish_lock =
989 ceph::make_mutex("PG::pg_stats_publish_lock");
990 bool pg_stats_publish_valid;
991 pg_stat_t pg_stats_publish;
992
993 friend class TestOpsSocketHook;
994 void publish_stats_to_osd() override;
995
996 bool needs_recovery() const {
997 return recovery_state.needs_recovery();
998 }
999 bool needs_backfill() const {
1000 return recovery_state.needs_backfill();
1001 }
1002
1003 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1004
1005 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1006 PG *pg;
1007 ObjectStore::Transaction *t;
1008 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1009
1010 // LogEntryHandler
1011 void remove(const hobject_t &hoid) override {
1012 pg->get_pgbackend()->remove(hoid, t);
1013 }
1014 void try_stash(const hobject_t &hoid, version_t v) override {
1015 pg->get_pgbackend()->try_stash(hoid, v, t);
1016 }
1017 void rollback(const pg_log_entry_t &entry) override {
1018 ceph_assert(entry.can_rollback());
1019 pg->get_pgbackend()->rollback(entry, t);
1020 }
1021 void rollforward(const pg_log_entry_t &entry) override {
1022 pg->get_pgbackend()->rollforward(entry, t);
1023 }
1024 void trim(const pg_log_entry_t &entry) override {
1025 pg->get_pgbackend()->trim(entry, t);
1026 }
1027 };
1028
1029 void update_object_snap_mapping(
1030 ObjectStore::Transaction *t, const hobject_t &soid,
1031 const set<snapid_t> &snaps);
1032 void clear_object_snap_mapping(
1033 ObjectStore::Transaction *t, const hobject_t &soid);
1034 void remove_snap_mapped_object(
1035 ObjectStore::Transaction& t, const hobject_t& soid);
1036
1037 bool have_unfound() const {
1038 return recovery_state.have_unfound();
1039 }
1040 uint64_t get_num_unfound() const {
1041 return recovery_state.get_num_unfound();
1042 }
1043
1044 virtual void check_local() = 0;
1045
1046 void purge_strays();
1047
1048 void update_heartbeat_peers(set<int> peers) override;
1049
1050 Context *finish_sync_event;
1051
1052 Context *finish_recovery();
1053 void _finish_recovery(Context *c);
1054 struct C_PG_FinishRecovery : public Context {
1055 PGRef pg;
1056 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1057 void finish(int r) override {
1058 pg->_finish_recovery(this);
1059 }
1060 };
1061 void cancel_recovery();
1062 void clear_recovery_state();
1063 virtual void _clear_recovery_state() = 0;
1064 void start_recovery_op(const hobject_t& soid);
1065 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1066
1067 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1068
1069 friend class C_OSD_RepModify_Commit;
1070 friend class C_DeleteMore;
1071
1072 // -- backoff --
1073 ceph::mutex backoff_lock = // orders inside Backoff::lock
1074 ceph::make_mutex("PG::backoff_lock");
1075 map<hobject_t,set<ceph::ref_t<Backoff>>> backoffs;
1076
1077 void add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end);
1078 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1079 void release_backoffs(const hobject_t& o) {
1080 release_backoffs(o, o);
1081 }
1082 void clear_backoffs();
1083
1084 void add_pg_backoff(const ceph::ref_t<Session>& s) {
1085 hobject_t begin = info.pgid.pgid.get_hobj_start();
1086 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1087 add_backoff(s, begin, end);
1088 }
1089public:
1090 void release_pg_backoffs() {
1091 hobject_t begin = info.pgid.pgid.get_hobj_start();
1092 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1093 release_backoffs(begin, end);
1094 }
1095protected:
1096
1097 // -- scrub --
1098public:
1099 struct Scrubber {
1100 Scrubber();
1101 ~Scrubber();
1102
1103 // metadata
1104 set<pg_shard_t> reserved_peers;
1105 bool local_reserved, remote_reserved, reserve_failed;
1106 epoch_t epoch_start;
1107
1108 // common to both scrubs
1109 bool active;
1110 set<pg_shard_t> waiting_on_whom;
1111 int shallow_errors;
1112 int deep_errors;
1113 int fixed;
1114 ScrubMap primary_scrubmap;
1115 ScrubMapBuilder primary_scrubmap_pos;
1116 epoch_t replica_scrub_start = 0;
1117 ScrubMap replica_scrubmap;
1118 ScrubMapBuilder replica_scrubmap_pos;
1119 map<pg_shard_t, ScrubMap> received_maps;
1120 OpRequestRef active_rep_scrub;
1121 utime_t scrub_reg_stamp; // stamp we registered for
1122
1123 static utime_t scrub_must_stamp() { return utime_t(0,1); }
1124
1125 omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };
1126
1127 // For async sleep
1128 bool sleeping = false;
1129 bool needs_sleep = true;
1130 utime_t sleep_start;
1131
1132 // flags to indicate explicitly requested scrubs (by admin)
1133 bool must_scrub, must_deep_scrub, must_repair, need_auto;
1134
1135 // Priority to use for scrub scheduling
1136 unsigned priority = 0;
1137
1138 bool time_for_deep;
1139 // this flag indicates whether we would like to do auto-repair of the PG or not
1140 bool auto_repair;
1141 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1142 bool check_repair;
1143 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1144 // we should deep scrub in order to auto repair
1145 bool deep_scrub_on_error;
1146
1147 // Maps from objects with errors to missing/inconsistent peers
1148 map<hobject_t, set<pg_shard_t>> missing;
1149 map<hobject_t, set<pg_shard_t>> inconsistent;
1150
1151 // Map from object with errors to good peers
1152 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1153
1154 // Cleaned map pending snap metadata scrub
1155 ScrubMap cleaned_meta_map;
1156
1157 void clean_meta_map(ScrubMap &for_meta_scrub) {
1158 if (end.is_max() ||
1159 cleaned_meta_map.objects.empty()) {
1160 cleaned_meta_map.swap(for_meta_scrub);
1161 } else {
1162 auto iter = cleaned_meta_map.objects.end();
1163 --iter; // not empty, see the if clause above
1164 auto begin = cleaned_meta_map.objects.begin();
1165 if (iter->first.has_snapset()) {
1166 ++iter;
1167 } else {
1168 while (iter != begin) {
1169 auto next = iter--;
1170 if (next->first.get_head() != iter->first.get_head()) {
1171 ++iter;
1172 break;
1173 }
1174 }
1175 }
1176 for_meta_scrub.objects.insert(begin, iter);
1177 cleaned_meta_map.objects.erase(begin, iter);
1178 }
1179 }
1180
1181 // digest updates which we are waiting on
1182 int num_digest_updates_pending;
1183
1184 // chunky scrub
1185 hobject_t start, end; // [start,end)
1186 hobject_t max_end; // Largest end that may have been sent to replicas
1187 eversion_t subset_last_update;
1188
1189 // chunky scrub state
1190 enum State {
1191 INACTIVE,
1192 NEW_CHUNK,
1193 WAIT_PUSHES,
1194 WAIT_LAST_UPDATE,
1195 BUILD_MAP,
1196 BUILD_MAP_DONE,
1197 WAIT_REPLICAS,
1198 COMPARE_MAPS,
1199 WAIT_DIGEST_UPDATES,
1200 FINISH,
1201 BUILD_MAP_REPLICA,
1202 } state;
1203
1204 std::unique_ptr<Scrub::Store> store;
1205 // deep scrub
1206 bool deep;
1207 int preempt_left;
1208 int preempt_divisor;
1209
1210 list<Context*> callbacks;
1211 void add_callback(Context *context) {
1212 callbacks.push_back(context);
1213 }
1214 void run_callbacks() {
1215 list<Context*> to_run;
1216 to_run.swap(callbacks);
1217 for (list<Context*>::iterator i = to_run.begin();
1218 i != to_run.end();
1219 ++i) {
1220 (*i)->complete(0);
1221 }
1222 }
1223
1224 static const char *state_string(const PG::Scrubber::State& state) {
1225 const char *ret = NULL;
1226 switch( state )
1227 {
1228 case INACTIVE: ret = "INACTIVE"; break;
1229 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1230 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1231 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1232 case BUILD_MAP: ret = "BUILD_MAP"; break;
1233 case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1234 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1235 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1236 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1237 case FINISH: ret = "FINISH"; break;
1238 case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1239 }
1240 return ret;
1241 }
1242
1243 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1244
1245 // clear all state
1246 void reset() {
1247 active = false;
1248 waiting_on_whom.clear();
1249 if (active_rep_scrub) {
1250 active_rep_scrub = OpRequestRef();
1251 }
1252 received_maps.clear();
1253
1254 must_scrub = false;
1255 must_deep_scrub = false;
1256 must_repair = false;
1257 need_auto = false;
1258 time_for_deep = false;
1259 auto_repair = false;
1260 check_repair = false;
1261 deep_scrub_on_error = false;
1262
1263 state = PG::Scrubber::INACTIVE;
1264 start = hobject_t();
1265 end = hobject_t();
1266 max_end = hobject_t();
1267 subset_last_update = eversion_t();
1268 shallow_errors = 0;
1269 deep_errors = 0;
1270 fixed = 0;
1271 omap_stats = (const struct omap_stat_t){ 0 };
1272 deep = false;
1273 run_callbacks();
1274 inconsistent.clear();
1275 missing.clear();
1276 authoritative.clear();
1277 num_digest_updates_pending = 0;
1278 primary_scrubmap = ScrubMap();
1279 primary_scrubmap_pos.reset();
1280 replica_scrubmap = ScrubMap();
1281 replica_scrubmap_pos.reset();
1282 cleaned_meta_map = ScrubMap();
1283 sleeping = false;
1284 needs_sleep = true;
1285 sleep_start = utime_t();
1286 }
1287
1288 void create_results(const hobject_t& obj);
1289 void cleanup_store(ObjectStore::Transaction *t);
1290 } scrubber;
1291
1292protected:
1293 bool scrub_after_recovery;
1294
1295 int active_pushes;
1296
1297 bool scrub_can_preempt = false;
1298 bool scrub_preempted = false;
1299
1300 // we allow some number of preemptions of the scrub, which means we do
1301 // not block. then we start to block. once we start blocking, we do
1302 // not stop until the scrub range is completed.
1303 bool write_blocked_by_scrub(const hobject_t &soid);
1304
1305 /// true if the given range intersects the scrub interval in any way
1306 bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1307
1308 void repair_object(
1309 const hobject_t &soid,
1310 const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers,
1311 const set<pg_shard_t> &bad_peers);
1312
1313 void chunky_scrub(ThreadPool::TPHandle &handle);
1314 void scrub_compare_maps();
1315 /**
1316 * return true if any inconsistency/missing is repaired, false otherwise
1317 */
1318 bool scrub_process_inconsistent();
1319 bool ops_blocked_by_scrub() const;
1320 void scrub_finish();
1321 void scrub_clear_state(bool keep_repair = false);
1322 void _scan_snaps(ScrubMap &map);
1323 void _repair_oinfo_oid(ScrubMap &map);
1324 void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
1325 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1326 hobject_t start, hobject_t end, bool deep,
1327 bool allow_preemption);
1328 int build_scrub_map_chunk(
1329 ScrubMap &map,
1330 ScrubMapBuilder &pos,
1331 hobject_t start, hobject_t end, bool deep,
1332 ThreadPool::TPHandle &handle);
1333 /**
1334 * returns true if [begin, end) is good to scrub at this time
1335 * a false return value obliges the implementer to requeue scrub when the
1336 * condition preventing scrub clears
1337 */
1338 virtual bool _range_available_for_scrub(
1339 const hobject_t &begin, const hobject_t &end) = 0;
1340 virtual void scrub_snapshot_metadata(
1341 ScrubMap &map,
1342 const std::map<hobject_t,
1343 pair<std::optional<uint32_t>,
1344 std::optional<uint32_t>>> &missing_digest) { }
1345 virtual void _scrub_clear_state() { }
1346 virtual void _scrub_finish() { }
1347 void clear_scrub_reserved();
1348 void scrub_reserve_replicas();
1349 void scrub_unreserve_replicas();
1350 bool scrub_all_replicas_reserved() const;
1351
1352 void replica_scrub(
1353 OpRequestRef op,
1354 ThreadPool::TPHandle &handle);
1355 void do_replica_scrub_map(OpRequestRef op);
1356
1357 void handle_scrub_reserve_request(OpRequestRef op);
1358 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1359 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1360 void handle_scrub_reserve_release(OpRequestRef op);
1361
1362 // -- recovery state --
1363
1364 struct QueuePeeringEvt : Context {
1365 PGRef pg;
1366 PGPeeringEventRef evt;
1367
1368 template <class EVT>
1369 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1370 pg(pg), evt(std::make_shared<PGPeeringEvent>(epoch, epoch, evt)) {}
1371
1372 QueuePeeringEvt(PG *pg, PGPeeringEventRef evt) :
1373 pg(pg), evt(std::move(evt)) {}
1374
1375 void finish(int r) override {
1376 pg->lock();
1377 pg->queue_peering_event(std::move(evt));
1378 pg->unlock();
1379 }
1380 };
1381
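  // Illustrative sketch (assumed call site, not code from this header): a
  // peering event can be re-delivered once a transaction commits by
  // registering this context, e.g.
  //
  //   t.register_on_commit(
  //     new QueuePeeringEvt(this, get_osdmap_epoch(), SomeStatechartEvent()));
  //
  // finish() then re-takes the PG lock and feeds the event back through
  // queue_peering_event().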
1382
1383public:
1384 int pg_stat_adjust(osd_stat_t *new_stat);
1385protected:
1386 bool delete_needs_sleep = false;
1387
1388protected:
1389 bool state_test(uint64_t m) const { return recovery_state.state_test(m); }
1390 void state_set(uint64_t m) { recovery_state.state_set(m); }
1391 void state_clear(uint64_t m) { recovery_state.state_clear(m); }
1392
1393 bool is_complete() const {
1394 return recovery_state.is_complete();
1395 }
1396 bool should_send_notify() const {
1397 return recovery_state.should_send_notify();
1398 }
1399
1400 bool is_active() const { return recovery_state.is_active(); }
1401 bool is_activating() const { return recovery_state.is_activating(); }
1402 bool is_peering() const { return recovery_state.is_peering(); }
1403 bool is_down() const { return recovery_state.is_down(); }
1404 bool is_recovery_unfound() const { return recovery_state.is_recovery_unfound(); }
1405 bool is_backfill_unfound() const { return recovery_state.is_backfill_unfound(); }
1406 bool is_incomplete() const { return recovery_state.is_incomplete(); }
1407 bool is_clean() const { return recovery_state.is_clean(); }
1408 bool is_degraded() const { return recovery_state.is_degraded(); }
1409 bool is_undersized() const { return recovery_state.is_undersized(); }
1410 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
1411 bool is_remapped() const { return recovery_state.is_remapped(); }
1412 bool is_peered() const { return recovery_state.is_peered(); }
1413 bool is_recovering() const { return recovery_state.is_recovering(); }
1414 bool is_premerge() const { return recovery_state.is_premerge(); }
1415 bool is_repair() const { return recovery_state.is_repair(); }
1416 bool is_laggy() const { return state_test(PG_STATE_LAGGY); }
1417 bool is_wait() const { return state_test(PG_STATE_WAIT); }
1418
1419 bool is_empty() const { return recovery_state.is_empty(); }
1420
1421 // pg on-disk state
1422 void do_pending_flush();
1423
1424public:
1425 virtual void prepare_write(
1426 pg_info_t &info,
1427 pg_info_t &last_written_info,
1428 PastIntervals &past_intervals,
1429 PGLog &pglog,
1430 bool dirty_info,
1431 bool dirty_big_info,
1432 bool need_write_epoch,
1433 ObjectStore::Transaction &t) override;
1434
1435 void write_if_dirty(PeeringCtx &rctx) {
1436 write_if_dirty(rctx.transaction);
1437 }
1438protected:
1439 void write_if_dirty(ObjectStore::Transaction& t) {
1440 recovery_state.write_if_dirty(t);
1441 }
1442
1443 PGLog::IndexedLog projected_log;
1444 bool check_in_progress_op(
1445 const osd_reqid_t &r,
1446 eversion_t *version,
1447 version_t *user_version,
1448 int *return_code,
1449 vector<pg_log_op_return_item_t> *op_returns) const;
1450 eversion_t projected_last_update;
1451 eversion_t get_next_version() const {
1452 eversion_t at_version(
1453 get_osdmap_epoch(),
1454 projected_last_update.version+1);
1455 ceph_assert(at_version > info.last_update);
1456 ceph_assert(at_version > recovery_state.get_pg_log().get_head());
1457 ceph_assert(at_version > projected_last_update);
1458 return at_version;
1459 }
1460
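  // Worked example with hypothetical values: if get_osdmap_epoch() == 120 and
  // projected_last_update == 120'34, get_next_version() returns 120'35; the
  // asserts above require it to be strictly newer than the last committed
  // update, the log head, and the last projected update.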
1461 bool check_log_for_corruption(ObjectStore *store);
1462
1463 std::string get_corrupt_pg_log_name() const;
1464
1465 void update_snap_map(
1466 const vector<pg_log_entry_t> &log_entries,
1467 ObjectStore::Transaction& t);
1468
1469 void filter_snapc(vector<snapid_t> &snaps);
1470
1471 virtual void kick_snap_trim() = 0;
1472 virtual void snap_trimmer_scrub_complete() = 0;
1473 bool requeue_scrub(bool high_priority = false);
1474 void queue_recovery();
1475 bool queue_scrub();
1476 unsigned get_scrub_priority();
1477
1478 bool try_flush_or_schedule_async() override;
1479 void start_flush_on_transaction(
1480 ObjectStore::Transaction &t) override;
1481
1482 void update_history(const pg_history_t& history) {
1483 recovery_state.update_history(history);
1484 }
1485
1486 // OpRequest queueing
1487 bool can_discard_op(OpRequestRef& op);
1488 bool can_discard_scan(OpRequestRef op);
1489 bool can_discard_backfill(OpRequestRef op);
1490 bool can_discard_request(OpRequestRef& op);
1491
1492 template<typename T, int MSGTYPE>
1493 bool can_discard_replica_op(OpRequestRef& op);
1494
1495 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
1496 bool old_peering_evt(PGPeeringEventRef evt) {
1497 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
1498 }
1499 bool have_same_or_newer_map(epoch_t e) {
1500 return e <= get_osdmap_epoch();
1501 }
1502
1503 bool op_has_sufficient_caps(OpRequestRef& op);
1504
1505 // abstract bits
1506 friend class FlushState;
1507
1508 friend ostream& operator<<(ostream& out, const PG& pg);
1509
1510protected:
1511 PeeringState recovery_state;
1512
1513 // ref to recovery_state.pool
1514 const PGPool &pool;
1515
1516 // ref to recovery_state.info
1517 const pg_info_t &info;
1518};
1519
1520
1521ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
1522
1523#endif