]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #ifndef CEPH_PG_H | |
16 | #define CEPH_PG_H | |
17 | ||
18 | #include <boost/statechart/custom_reaction.hpp> | |
19 | #include <boost/statechart/event.hpp> | |
20 | #include <boost/statechart/simple_state.hpp> | |
21 | #include <boost/statechart/state.hpp> | |
22 | #include <boost/statechart/state_machine.hpp> | |
23 | #include <boost/statechart/transition.hpp> | |
24 | #include <boost/statechart/event_base.hpp> | |
25 | #include <boost/scoped_ptr.hpp> | |
91327a77 | 26 | #include <boost/container/flat_set.hpp> |
7c673cae FG |
27 | #include "include/mempool.h" |
28 | ||
29 | // re-include our assert to clobber boost's | |
11fdf7f2 | 30 | #include "include/ceph_assert.h" |
9f95a23c | 31 | #include "include/common_fwd.h" |
7c673cae FG |
32 | |
33 | #include "include/types.h" | |
34 | #include "include/stringify.h" | |
35 | #include "osd_types.h" | |
36 | #include "include/xlist.h" | |
37 | #include "SnapMapper.h" | |
38 | #include "Session.h" | |
39 | #include "common/Timer.h" | |
40 | ||
41 | #include "PGLog.h" | |
42 | #include "OSDMap.h" | |
43 | #include "messages/MOSDPGLog.h" | |
44 | #include "include/str_list.h" | |
45 | #include "PGBackend.h" | |
11fdf7f2 | 46 | #include "PGPeeringEvent.h" |
9f95a23c TL |
47 | #include "PeeringState.h" |
48 | #include "MissingLoc.h" | |
11fdf7f2 TL |
49 | |
50 | #include "mgr/OSDPerfMetricTypes.h" | |
7c673cae FG |
51 | |
#include <atomic>
#include <list>
#include <memory>
#include <string>
#include <tuple>
#include <utility>
7c673cae FG |
57 | |
58 | //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs | |
11fdf7f2 | 59 | //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks |
7c673cae FG |
60 | |
61 | class OSD; | |
62 | class OSDService; | |
11fdf7f2 TL |
63 | class OSDShard; |
64 | class OSDShardPGSlot; | |
7c673cae FG |
65 | class MOSDOp; |
66 | class MOSDPGScan; | |
67 | class MOSDPGBackfill; | |
68 | class MOSDPGInfo; | |
69 | ||
70 | class PG; | |
71 | struct OpRequest; | |
72 | typedef OpRequest::Ref OpRequestRef; | |
73 | class MOSDPGLog; | |
11fdf7f2 | 74 | class DynamicPerfStats; |
7c673cae FG |
75 | |
76 | namespace Scrub { | |
77 | class Store; | |
78 | } | |
79 | ||
7c673cae FG |
80 | #ifdef PG_DEBUG_REFS |
81 | #include "common/tracked_int_ptr.hpp" | |
82 | uint64_t get_with_id(PG *pg); | |
83 | void put_with_id(PG *pg, uint64_t id); | |
84 | typedef TrackedIntPtr<PG> PGRef; | |
85 | #else | |
86 | typedef boost::intrusive_ptr<PG> PGRef; | |
87 | #endif | |
88 | ||
89 | class PGRecoveryStats { | |
90 | struct per_state_info { | |
91 | uint64_t enter, exit; // enter/exit counts | |
92 | uint64_t events; | |
93 | utime_t event_time; // time spent processing events | |
94 | utime_t total_time; // total time in state | |
95 | utime_t min_time, max_time; | |
96 | ||
97 | // cppcheck-suppress unreachableCode | |
98 | per_state_info() : enter(0), exit(0), events(0) {} | |
99 | }; | |
100 | map<const char *,per_state_info> info; | |
9f95a23c | 101 | ceph::mutex lock = ceph::make_mutex("PGRecoverStats::lock"); |
7c673cae FG |
102 | |
103 | public: | |
9f95a23c | 104 | PGRecoveryStats() = default; |
7c673cae FG |
105 | |
106 | void reset() { | |
11fdf7f2 | 107 | std::lock_guard l(lock); |
7c673cae FG |
108 | info.clear(); |
109 | } | |
110 | void dump(ostream& out) { | |
11fdf7f2 | 111 | std::lock_guard l(lock); |
7c673cae FG |
112 | for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) { |
113 | per_state_info& i = p->second; | |
114 | out << i.enter << "\t" << i.exit << "\t" | |
115 | << i.events << "\t" << i.event_time << "\t" | |
116 | << i.total_time << "\t" | |
117 | << i.min_time << "\t" << i.max_time << "\t" | |
118 | << p->first << "\n"; | |
119 | } | |
120 | } | |
121 | ||
122 | void dump_formatted(Formatter *f) { | |
11fdf7f2 | 123 | std::lock_guard l(lock); |
7c673cae FG |
124 | f->open_array_section("pg_recovery_stats"); |
125 | for (map<const char *,per_state_info>::iterator p = info.begin(); | |
126 | p != info.end(); ++p) { | |
127 | per_state_info& i = p->second; | |
128 | f->open_object_section("recovery_state"); | |
129 | f->dump_int("enter", i.enter); | |
130 | f->dump_int("exit", i.exit); | |
131 | f->dump_int("events", i.events); | |
132 | f->dump_stream("event_time") << i.event_time; | |
133 | f->dump_stream("total_time") << i.total_time; | |
134 | f->dump_stream("min_time") << i.min_time; | |
135 | f->dump_stream("max_time") << i.max_time; | |
136 | vector<string> states; | |
137 | get_str_vec(p->first, "/", states); | |
138 | f->open_array_section("nested_states"); | |
139 | for (vector<string>::iterator st = states.begin(); | |
140 | st != states.end(); ++st) { | |
141 | f->dump_string("state", *st); | |
142 | } | |
143 | f->close_section(); | |
144 | f->close_section(); | |
145 | } | |
146 | f->close_section(); | |
147 | } | |
148 | ||
149 | void log_enter(const char *s) { | |
11fdf7f2 | 150 | std::lock_guard l(lock); |
7c673cae FG |
151 | info[s].enter++; |
152 | } | |
153 | void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) { | |
11fdf7f2 | 154 | std::lock_guard l(lock); |
7c673cae FG |
155 | per_state_info &i = info[s]; |
156 | i.exit++; | |
157 | i.total_time += dur; | |
158 | if (dur > i.max_time) | |
159 | i.max_time = dur; | |
160 | if (dur < i.min_time || i.min_time == utime_t()) | |
161 | i.min_time = dur; | |
162 | i.events += events; | |
163 | i.event_time += event_dur; | |
164 | } | |
165 | }; | |
166 | ||
7c673cae FG |
167 | /** PG - Replica Placement Group |
168 | * | |
169 | */ | |
170 | ||
9f95a23c TL |
171 | class PG : public DoutPrefixProvider, public PeeringState::PeeringListener { |
172 | friend class NamedState; | |
173 | friend class PeeringState; | |
174 | ||
a8e16298 | 175 | public: |
9f95a23c | 176 | const pg_shard_t pg_whoami; |
11fdf7f2 | 177 | const spg_t pg_id; |
9f95a23c TL |
178 | |
179 | public: | |
180 | // -- members -- | |
11fdf7f2 TL |
181 | const coll_t coll; |
182 | ||
183 | ObjectStore::CollectionHandle ch; | |
184 | ||
11fdf7f2 TL |
185 | // -- methods -- |
186 | std::ostream& gen_prefix(std::ostream& out) const override; | |
187 | CephContext *get_cct() const override { | |
188 | return cct; | |
189 | } | |
190 | unsigned get_subsys() const override { | |
191 | return ceph_subsys_osd; | |
192 | } | |
193 | ||
9f95a23c TL |
194 | const char* const get_current_state() const { |
195 | return recovery_state.get_current_state(); | |
196 | } | |
197 | ||
11fdf7f2 TL |
198 | const OSDMapRef& get_osdmap() const { |
199 | ceph_assert(is_locked()); | |
9f95a23c | 200 | return recovery_state.get_osdmap(); |
11fdf7f2 | 201 | } |
9f95a23c TL |
202 | |
203 | epoch_t get_osdmap_epoch() const override final { | |
204 | return recovery_state.get_osdmap()->get_epoch(); | |
11fdf7f2 TL |
205 | } |
206 | ||
9f95a23c TL |
207 | PerfCounters &get_peering_perf() override; |
208 | PerfCounters &get_perf_logger() override; | |
209 | void log_state_enter(const char *state) override; | |
210 | void log_state_exit( | |
211 | const char *state_name, utime_t enter_time, | |
212 | uint64_t events, utime_t event_dur) override; | |
213 | ||
11fdf7f2 TL |
214 | void lock_suspend_timeout(ThreadPool::TPHandle &handle) { |
215 | handle.suspend_tp_timeout(); | |
216 | lock(); | |
217 | handle.reset_tp_timeout(); | |
218 | } | |
219 | void lock(bool no_lockdep = false) const; | |
9f95a23c TL |
220 | void unlock() const; |
221 | bool is_locked() const; | |
11fdf7f2 TL |
222 | |
223 | const spg_t& get_pgid() const { | |
224 | return pg_id; | |
225 | } | |
226 | ||
227 | const PGPool& get_pool() const { | |
228 | return pool; | |
229 | } | |
230 | uint64_t get_last_user_version() const { | |
231 | return info.last_user_version; | |
232 | } | |
233 | const pg_history_t& get_history() const { | |
234 | return info.history; | |
235 | } | |
236 | bool get_need_up_thru() const { | |
9f95a23c | 237 | return recovery_state.get_need_up_thru(); |
11fdf7f2 TL |
238 | } |
239 | epoch_t get_same_interval_since() const { | |
240 | return info.history.same_interval_since; | |
241 | } | |
242 | ||
9f95a23c TL |
243 | static void set_last_scrub_stamp( |
244 | utime_t t, pg_history_t &history, pg_stat_t &stats) { | |
245 | stats.last_scrub_stamp = t; | |
246 | history.last_scrub_stamp = t; | |
247 | } | |
248 | ||
11fdf7f2 | 249 | void set_last_scrub_stamp(utime_t t) { |
9f95a23c TL |
250 | recovery_state.update_stats( |
251 | [=](auto &history, auto &stats) { | |
252 | set_last_scrub_stamp(t, history, stats); | |
253 | return true; | |
254 | }); | |
255 | } | |
256 | ||
257 | static void set_last_deep_scrub_stamp( | |
258 | utime_t t, pg_history_t &history, pg_stat_t &stats) { | |
259 | stats.last_deep_scrub_stamp = t; | |
260 | history.last_deep_scrub_stamp = t; | |
11fdf7f2 TL |
261 | } |
262 | ||
263 | void set_last_deep_scrub_stamp(utime_t t) { | |
9f95a23c TL |
264 | recovery_state.update_stats( |
265 | [=](auto &history, auto &stats) { | |
266 | set_last_deep_scrub_stamp(t, history, stats); | |
267 | return true; | |
268 | }); | |
11fdf7f2 TL |
269 | } |
270 | ||
271 | bool is_deleting() const { | |
9f95a23c | 272 | return recovery_state.is_deleting(); |
11fdf7f2 TL |
273 | } |
274 | bool is_deleted() const { | |
9f95a23c | 275 | return recovery_state.is_deleted(); |
11fdf7f2 | 276 | } |
9f95a23c TL |
277 | bool is_nonprimary() const { |
278 | return recovery_state.is_nonprimary(); | |
11fdf7f2 TL |
279 | } |
280 | bool is_primary() const { | |
9f95a23c | 281 | return recovery_state.is_primary(); |
11fdf7f2 TL |
282 | } |
283 | bool pg_has_reset_since(epoch_t e) { | |
284 | ceph_assert(is_locked()); | |
9f95a23c | 285 | return recovery_state.pg_has_reset_since(e); |
11fdf7f2 TL |
286 | } |
287 | ||
288 | bool is_ec_pg() const { | |
9f95a23c | 289 | return recovery_state.is_ec_pg(); |
11fdf7f2 TL |
290 | } |
291 | int get_role() const { | |
9f95a23c | 292 | return recovery_state.get_role(); |
11fdf7f2 TL |
293 | } |
294 | const vector<int> get_acting() const { | |
9f95a23c TL |
295 | return recovery_state.get_acting(); |
296 | } | |
297 | const set<pg_shard_t> &get_actingset() const { | |
298 | return recovery_state.get_actingset(); | |
11fdf7f2 TL |
299 | } |
300 | int get_acting_primary() const { | |
9f95a23c | 301 | return recovery_state.get_acting_primary(); |
11fdf7f2 TL |
302 | } |
303 | pg_shard_t get_primary() const { | |
9f95a23c | 304 | return recovery_state.get_primary(); |
11fdf7f2 TL |
305 | } |
306 | const vector<int> get_up() const { | |
9f95a23c | 307 | return recovery_state.get_up(); |
11fdf7f2 TL |
308 | } |
309 | int get_up_primary() const { | |
9f95a23c | 310 | return recovery_state.get_up_primary(); |
11fdf7f2 TL |
311 | } |
312 | const PastIntervals& get_past_intervals() const { | |
9f95a23c TL |
313 | return recovery_state.get_past_intervals(); |
314 | } | |
315 | bool is_acting_recovery_backfill(pg_shard_t osd) const { | |
316 | return recovery_state.is_acting_recovery_backfill(osd); | |
317 | } | |
318 | const set<pg_shard_t> &get_acting_recovery_backfill() const { | |
319 | return recovery_state.get_acting_recovery_backfill(); | |
320 | } | |
321 | bool is_acting(pg_shard_t osd) const { | |
322 | return recovery_state.is_acting(osd); | |
323 | } | |
324 | bool is_up(pg_shard_t osd) const { | |
325 | return recovery_state.is_up(osd); | |
326 | } | |
327 | static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) { | |
328 | return PeeringState::has_shard(ec, v, osd); | |
11fdf7f2 TL |
329 | } |
330 | ||
331 | /// initialize created PG | |
332 | void init( | |
333 | int role, | |
334 | const vector<int>& up, | |
335 | int up_primary, | |
336 | const vector<int>& acting, | |
337 | int acting_primary, | |
338 | const pg_history_t& history, | |
339 | const PastIntervals& pim, | |
340 | bool backfill, | |
9f95a23c | 341 | ObjectStore::Transaction &t); |
11fdf7f2 TL |
342 | |
343 | /// read existing pg state off disk | |
344 | void read_state(ObjectStore *store); | |
345 | static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch); | |
346 | ||
347 | static int get_latest_struct_v() { | |
9f95a23c | 348 | return pg_latest_struct_v; |
11fdf7f2 TL |
349 | } |
350 | static int get_compat_struct_v() { | |
9f95a23c | 351 | return pg_compat_struct_v; |
11fdf7f2 TL |
352 | } |
353 | static int read_info( | |
354 | ObjectStore *store, spg_t pgid, const coll_t &coll, | |
355 | pg_info_t &info, PastIntervals &past_intervals, | |
356 | __u8 &); | |
357 | static bool _has_removal_flag(ObjectStore *store, spg_t pgid); | |
358 | ||
9f95a23c | 359 | void rm_backoff(const ceph::ref_t<Backoff>& b); |
11fdf7f2 TL |
360 | |
361 | void update_snap_mapper_bits(uint32_t bits) { | |
362 | snap_mapper.update_bits(bits); | |
363 | } | |
364 | void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v); | |
365 | virtual void split_colls( | |
366 | spg_t child, | |
367 | int split_bits, | |
368 | int seed, | |
369 | const pg_pool_t *pool, | |
9f95a23c | 370 | ObjectStore::Transaction &t) = 0; |
11fdf7f2 | 371 | void split_into(pg_t child_pgid, PG *child, unsigned split_bits); |
9f95a23c | 372 | void merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx, |
11fdf7f2 TL |
373 | unsigned split_bits, |
374 | const pg_merge_meta_t& last_pg_merge_meta); | |
9f95a23c TL |
375 | void finish_split_stats(const object_stat_sum_t& stats, |
376 | ObjectStore::Transaction &t); | |
11fdf7f2 TL |
377 | |
378 | void scrub(epoch_t queued, ThreadPool::TPHandle &handle); | |
494da23a TL |
379 | |
380 | bool is_scrub_registered(); | |
11fdf7f2 TL |
381 | void reg_next_scrub(); |
382 | void unreg_next_scrub(); | |
383 | ||
9f95a23c TL |
384 | void queue_want_pg_temp(const vector<int> &wanted) override; |
385 | void clear_want_pg_temp() override; | |
386 | ||
387 | void on_new_interval() override; | |
388 | ||
389 | void on_role_change() override; | |
390 | virtual void plpg_on_role_change() = 0; | |
391 | ||
392 | void init_collection_pool_opts(); | |
393 | void on_pool_change() override; | |
394 | virtual void plpg_on_pool_change() = 0; | |
395 | ||
396 | void on_info_history_change() override; | |
397 | ||
398 | void scrub_requested(bool deep, bool repair, bool need_auto = false) override; | |
399 | ||
400 | uint64_t get_snap_trimq_size() const override { | |
401 | return snap_trimq.size(); | |
402 | } | |
403 | unsigned get_target_pg_log_entries() const override; | |
404 | ||
405 | void clear_publish_stats() override; | |
406 | void clear_primary_state() override; | |
494da23a | 407 | |
9f95a23c TL |
408 | epoch_t oldest_stored_osdmap() override; |
409 | OstreamTemp get_clog_error() override; | |
410 | OstreamTemp get_clog_info() override; | |
411 | OstreamTemp get_clog_debug() override; | |
412 | ||
413 | void schedule_event_after( | |
414 | PGPeeringEventRef event, | |
415 | float delay) override; | |
416 | void request_local_background_io_reservation( | |
417 | unsigned priority, | |
418 | PGPeeringEventRef on_grant, | |
419 | PGPeeringEventRef on_preempt) override; | |
420 | void update_local_background_io_priority( | |
421 | unsigned priority) override; | |
422 | void cancel_local_background_io_reservation() override; | |
423 | ||
424 | void request_remote_recovery_reservation( | |
425 | unsigned priority, | |
426 | PGPeeringEventRef on_grant, | |
427 | PGPeeringEventRef on_preempt) override; | |
428 | void cancel_remote_recovery_reservation() override; | |
429 | ||
430 | void schedule_event_on_commit( | |
431 | ObjectStore::Transaction &t, | |
432 | PGPeeringEventRef on_commit) override; | |
433 | ||
434 | void on_active_exit() override; | |
435 | ||
436 | Context *on_clean() override { | |
437 | if (is_active()) { | |
438 | kick_snap_trim(); | |
439 | } | |
440 | requeue_ops(waiting_for_clean_to_primary_repair); | |
441 | return finish_recovery(); | |
442 | } | |
443 | ||
444 | void on_activate(interval_set<snapid_t> snaps) override { | |
445 | ceph_assert(scrubber.callbacks.empty()); | |
446 | ceph_assert(callbacks_for_degraded_object.empty()); | |
447 | snap_trimq = snaps; | |
448 | release_pg_backoffs(); | |
449 | projected_last_update = info.last_update; | |
450 | } | |
451 | ||
452 | void on_activate_committed() override; | |
453 | ||
454 | void on_active_actmap() override; | |
455 | void on_active_advmap(const OSDMapRef &osdmap) override; | |
456 | ||
457 | void queue_snap_retrim(snapid_t snap); | |
458 | ||
459 | void on_backfill_reserved() override; | |
460 | void on_backfill_canceled() override; | |
461 | void on_recovery_reserved() override; | |
494da23a | 462 | |
11fdf7f2 | 463 | bool is_forced_recovery_or_backfill() const { |
9f95a23c TL |
464 | return recovery_state.is_forced_recovery_or_backfill(); |
465 | } | |
466 | ||
467 | PGLog::LogEntryHandlerRef get_log_handler( | |
468 | ObjectStore::Transaction &t) override { | |
469 | return std::make_unique<PG::PGLogEntryHandler>(this, &t); | |
11fdf7f2 | 470 | } |
9f95a23c TL |
471 | |
472 | void do_delete_work(ObjectStore::Transaction &t) override; | |
473 | ||
474 | void clear_ready_to_merge() override; | |
475 | void set_not_ready_to_merge_target(pg_t pgid, pg_t src) override; | |
476 | void set_not_ready_to_merge_source(pg_t pgid) override; | |
477 | void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) override; | |
478 | void set_ready_to_merge_source(eversion_t lu) override; | |
479 | ||
480 | void send_pg_created(pg_t pgid) override; | |
481 | ||
482 | ceph::signedspan get_mnow() override; | |
483 | HeartbeatStampsRef get_hb_stamps(int peer) override; | |
484 | void schedule_renew_lease(epoch_t lpr, ceph::timespan delay) override; | |
485 | void queue_check_readable(epoch_t lpr, ceph::timespan delay) override; | |
486 | ||
487 | void rebuild_missing_set_with_deletes(PGLog &pglog) override; | |
a8e16298 | 488 | |
11fdf7f2 | 489 | void queue_peering_event(PGPeeringEventRef evt); |
9f95a23c | 490 | void do_peering_event(PGPeeringEventRef evt, PeeringCtx &rcx); |
11fdf7f2 TL |
491 | void queue_null(epoch_t msg_epoch, epoch_t query_epoch); |
492 | void queue_flushed(epoch_t started_at); | |
493 | void handle_advance_map( | |
494 | OSDMapRef osdmap, OSDMapRef lastmap, | |
495 | vector<int>& newup, int up_primary, | |
496 | vector<int>& newacting, int acting_primary, | |
9f95a23c TL |
497 | PeeringCtx &rctx); |
498 | void handle_activate_map(PeeringCtx &rctx); | |
499 | void handle_initialize(PeeringCtx &rxcx); | |
11fdf7f2 TL |
500 | void handle_query_state(Formatter *f); |
501 | ||
502 | /** | |
503 | * @param ops_begun returns how many recovery ops the function started | |
504 | * @returns true if any useful work was accomplished; false otherwise | |
505 | */ | |
506 | virtual bool start_recovery_ops( | |
507 | uint64_t max, | |
508 | ThreadPool::TPHandle &handle, | |
509 | uint64_t *ops_begun) = 0; | |
510 | ||
9f95a23c TL |
511 | // more work after the above, but with a PeeringCtx |
512 | void find_unfound(epoch_t queued, PeeringCtx &rctx); | |
11fdf7f2 TL |
513 | |
514 | virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0; | |
515 | ||
516 | void dump_pgstate_history(Formatter *f); | |
517 | void dump_missing(Formatter *f); | |
518 | ||
519 | void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f); | |
520 | void with_heartbeat_peers(std::function<void(int)> f); | |
521 | ||
522 | void shutdown(); | |
523 | virtual void on_shutdown() = 0; | |
524 | ||
525 | bool get_must_scrub() const { | |
526 | return scrubber.must_scrub; | |
527 | } | |
528 | bool sched_scrub(); | |
529 | ||
530 | virtual void do_request( | |
531 | OpRequestRef& op, | |
532 | ThreadPool::TPHandle &handle | |
533 | ) = 0; | |
534 | virtual void clear_cache() = 0; | |
535 | virtual int get_cache_obj_count() = 0; | |
536 | ||
537 | virtual void snap_trimmer(epoch_t epoch_queued) = 0; | |
9f95a23c TL |
538 | virtual void do_command( |
539 | const string_view& prefix, | |
540 | const cmdmap_t& cmdmap, | |
541 | const bufferlist& idata, | |
542 | std::function<void(int,const std::string&,bufferlist&)> on_finish) = 0; | |
11fdf7f2 TL |
543 | |
544 | virtual bool agent_work(int max) = 0; | |
545 | virtual bool agent_work(int max, int agent_flush_quota) = 0; | |
546 | virtual void agent_stop() = 0; | |
547 | virtual void agent_delay() = 0; | |
548 | virtual void agent_clear() = 0; | |
549 | virtual void agent_choose_mode_restart() = 0; | |
550 | ||
9f95a23c TL |
551 | struct C_DeleteMore : public Context { |
552 | PGRef pg; | |
553 | epoch_t epoch; | |
554 | C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {} | |
555 | void finish(int r) override { | |
556 | ceph_abort(); | |
557 | } | |
558 | void complete(int r) override; | |
559 | }; | |
11fdf7f2 TL |
560 | |
561 | void _delete_some(ObjectStore::Transaction *t); | |
562 | ||
563 | virtual void set_dynamic_perf_stats_queries( | |
564 | const std::list<OSDPerfMetricQuery> &queries) { | |
565 | } | |
566 | virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) { | |
567 | } | |
568 | ||
9f95a23c TL |
569 | uint64_t get_min_alloc_size() const; |
570 | ||
11fdf7f2 TL |
571 | // reference counting |
572 | #ifdef PG_DEBUG_REFS | |
573 | uint64_t get_with_id(); | |
574 | void put_with_id(uint64_t); | |
575 | void dump_live_ids(); | |
576 | #endif | |
577 | void get(const char* tag); | |
578 | void put(const char* tag); | |
579 | int get_num_ref() { | |
580 | return ref; | |
581 | } | |
582 | ||
583 | // ctor | |
584 | PG(OSDService *o, OSDMapRef curmap, | |
585 | const PGPool &pool, spg_t p); | |
586 | ~PG() override; | |
587 | ||
588 | // prevent copying | |
589 | explicit PG(const PG& rhs) = delete; | |
590 | PG& operator=(const PG& rhs) = delete; | |
591 | ||
7c673cae | 592 | protected: |
11fdf7f2 TL |
593 | // ------------- |
594 | // protected | |
7c673cae | 595 | OSDService *osd; |
11fdf7f2 TL |
596 | public: |
597 | OSDShard *osd_shard = nullptr; | |
598 | OSDShardPGSlot *pg_slot = nullptr; | |
599 | protected: | |
7c673cae | 600 | CephContext *cct; |
11fdf7f2 | 601 | |
11fdf7f2 TL |
602 | // locking and reference counting. |
603 | // I destroy myself when the reference count hits zero. | |
604 | // lock() should be called before doing anything. | |
605 | // get() should be called on pointer copy (to another thread, etc.). | |
606 | // put() should be called on destruction of some previously copied pointer. | |
607 | // unlock() when done with the current pointer (_most common_). | |
9f95a23c TL |
608 | mutable ceph::mutex _lock = ceph::make_mutex("PG::_lock"); |
609 | #ifndef CEPH_DEBUG_MUTEX | |
610 | mutable std::thread::id locked_by; | |
611 | #endif | |
11fdf7f2 TL |
612 | std::atomic<unsigned int> ref{0}; |
613 | ||
614 | #ifdef PG_DEBUG_REFS | |
9f95a23c | 615 | ceph::mutex _ref_id_lock = ceph::make_mutex("PG::_ref_id_lock"); |
11fdf7f2 TL |
616 | map<uint64_t, string> _live_ids; |
617 | map<string, uint64_t> _tag_counts; | |
618 | uint64_t _ref_id = 0; | |
619 | ||
620 | friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); } | |
621 | friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); } | |
622 | #endif | |
623 | ||
624 | private: | |
625 | friend void intrusive_ptr_add_ref(PG *pg) { | |
626 | pg->get("intptr"); | |
627 | } | |
628 | friend void intrusive_ptr_release(PG *pg) { | |
629 | pg->put("intptr"); | |
630 | } | |
631 | ||
632 | ||
633 | // ===================== | |
634 | ||
635 | protected: | |
7c673cae FG |
636 | OSDriver osdriver; |
637 | SnapMapper snap_mapper; | |
224ce89b | 638 | bool eio_errors_to_process = false; |
7c673cae FG |
639 | |
640 | virtual PGBackend *get_pgbackend() = 0; | |
11fdf7f2 | 641 | virtual const PGBackend* get_pgbackend() const = 0; |
7c673cae | 642 | |
11fdf7f2 | 643 | protected: |
7c673cae FG |
644 | void requeue_map_waiters(); |
645 | ||
7c673cae FG |
646 | protected: |
647 | ||
7c673cae FG |
648 | ZTracer::Endpoint trace_endpoint; |
649 | ||
7c673cae | 650 | |
11fdf7f2 | 651 | protected: |
11fdf7f2 | 652 | __u8 info_struct_v = 0; |
7c673cae FG |
653 | void upgrade(ObjectStore *store); |
654 | ||
11fdf7f2 | 655 | protected: |
7c673cae FG |
656 | ghobject_t pgmeta_oid; |
657 | ||
91327a77 | 658 | // ------------------ |
7c673cae | 659 | interval_set<snapid_t> snap_trimq; |
9f95a23c | 660 | set<snapid_t> snap_trimq_repeat; |
7c673cae FG |
661 | |
662 | /* You should not use these items without taking their respective queue locks | |
663 | * (if they have one) */ | |
664 | xlist<PG*>::item stat_queue_item; | |
665 | bool scrub_queued; | |
666 | bool recovery_queued; | |
667 | ||
668 | int recovery_ops_active; | |
669 | set<pg_shard_t> waiting_on_backfill; | |
670 | #ifdef DEBUG_RECOVERY_OIDS | |
11fdf7f2 | 671 | multiset<hobject_t> recovering_oids; |
7c673cae FG |
672 | #endif |
673 | ||
94b18763 FG |
674 | public: |
675 | bool dne() { return info.dne(); } | |
7c673cae | 676 | |
9f95a23c TL |
677 | virtual void send_cluster_message( |
678 | int osd, Message *m, epoch_t epoch, bool share_map_update) override; | |
7c673cae FG |
679 | |
680 | protected: | |
11fdf7f2 | 681 | epoch_t get_last_peering_reset() const { |
9f95a23c | 682 | return recovery_state.get_last_peering_reset(); |
11fdf7f2 | 683 | } |
7c673cae FG |
684 | |
685 | /* heartbeat peers */ | |
9f95a23c TL |
686 | void set_probe_targets(const set<pg_shard_t> &probe_set) override; |
687 | void clear_probe_targets() override; | |
11fdf7f2 | 688 | |
9f95a23c TL |
689 | ceph::mutex heartbeat_peer_lock = |
690 | ceph::make_mutex("PG::heartbeat_peer_lock"); | |
7c673cae FG |
691 | set<int> heartbeat_peers; |
692 | set<int> probe_targets; | |
693 | ||
11fdf7f2 | 694 | public: |
7c673cae FG |
695 | /** |
696 | * BackfillInterval | |
697 | * | |
698 | * Represents the objects in a range [begin, end) | |
699 | * | |
700 | * Possible states: | |
701 | * 1) begin == end == hobject_t() indicates the interval is unpopulated | |
702 | * 2) Else, objects contains all objects in [begin, end) | |
703 | */ | |
704 | struct BackfillInterval { | |
705 | // info about a backfill interval on a peer | |
706 | eversion_t version; /// version at which the scan occurred | |
707 | map<hobject_t,eversion_t> objects; | |
708 | hobject_t begin; | |
709 | hobject_t end; | |
710 | ||
711 | /// clear content | |
712 | void clear() { | |
713 | *this = BackfillInterval(); | |
714 | } | |
715 | ||
716 | /// clear objects list only | |
717 | void clear_objects() { | |
718 | objects.clear(); | |
719 | } | |
720 | ||
721 | /// reinstantiate with a new start+end position and sort order | |
722 | void reset(hobject_t start) { | |
723 | clear(); | |
724 | begin = end = start; | |
725 | } | |
726 | ||
727 | /// true if there are no objects in this interval | |
728 | bool empty() const { | |
729 | return objects.empty(); | |
730 | } | |
731 | ||
732 | /// true if interval extends to the end of the range | |
733 | bool extends_to_end() const { | |
734 | return end.is_max(); | |
735 | } | |
736 | ||
737 | /// removes items <= soid and adjusts begin to the first object | |
738 | void trim_to(const hobject_t &soid) { | |
739 | trim(); | |
740 | while (!objects.empty() && | |
741 | objects.begin()->first <= soid) { | |
742 | pop_front(); | |
743 | } | |
744 | } | |
745 | ||
746 | /// Adjusts begin to the first object | |
747 | void trim() { | |
748 | if (!objects.empty()) | |
749 | begin = objects.begin()->first; | |
750 | else | |
751 | begin = end; | |
752 | } | |
753 | ||
754 | /// drop first entry, and adjust @begin accordingly | |
755 | void pop_front() { | |
11fdf7f2 | 756 | ceph_assert(!objects.empty()); |
7c673cae FG |
757 | objects.erase(objects.begin()); |
758 | trim(); | |
759 | } | |
760 | ||
761 | /// dump | |
762 | void dump(Formatter *f) const { | |
763 | f->dump_stream("begin") << begin; | |
764 | f->dump_stream("end") << end; | |
765 | f->open_array_section("objects"); | |
766 | for (map<hobject_t, eversion_t>::const_iterator i = | |
767 | objects.begin(); | |
768 | i != objects.end(); | |
769 | ++i) { | |
770 | f->open_object_section("object"); | |
771 | f->dump_stream("object") << i->first; | |
772 | f->dump_stream("version") << i->second; | |
773 | f->close_section(); | |
774 | } | |
775 | f->close_section(); | |
776 | } | |
777 | }; | |
778 | ||
779 | protected: | |
780 | BackfillInterval backfill_info; | |
781 | map<pg_shard_t, BackfillInterval> peer_backfill_info; | |
7c673cae FG |
782 | bool backfill_reserving; |
783 | ||
11fdf7f2 TL |
784 | // The primary's num_bytes and local num_bytes for this pg, only valid |
785 | // during backfill for non-primary shards. | |
786 | // Both of these are adjusted for EC to reflect the on-disk bytes | |
787 | std::atomic<int64_t> primary_num_bytes = 0; | |
788 | std::atomic<int64_t> local_num_bytes = 0; | |
7c673cae | 789 | |
11fdf7f2 | 790 | public: |
11fdf7f2 TL |
791 | // Space reserved for backfill is primary_num_bytes - local_num_bytes |
792 | // Don't care that difference itself isn't atomic | |
793 | uint64_t get_reserved_num_bytes() { | |
794 | int64_t primary = primary_num_bytes.load(); | |
795 | int64_t local = local_num_bytes.load(); | |
796 | if (primary > local) | |
797 | return primary - local; | |
798 | else | |
799 | return 0; | |
800 | } | |
801 | ||
802 | bool is_remote_backfilling() { | |
803 | return primary_num_bytes.load() > 0; | |
804 | } | |
805 | ||
9f95a23c TL |
806 | bool try_reserve_recovery_space(int64_t primary, int64_t local) override; |
807 | void unreserve_recovery_space() override; | |
11fdf7f2 TL |
808 | |
809 | // If num_bytes are inconsistent and local_num_bytes goes negative | |
810 | // it's ok, because it would then be ignored. | |
811 | ||
812 | // The value of num_bytes could be negative, | |
813 | // but we don't let local_num_bytes go negative. | |
814 | void add_local_num_bytes(int64_t num_bytes) { | |
815 | if (num_bytes) { | |
816 | int64_t prev_bytes = local_num_bytes.load(); | |
817 | int64_t new_bytes; | |
818 | do { | |
819 | new_bytes = prev_bytes + num_bytes; | |
820 | if (new_bytes < 0) | |
821 | new_bytes = 0; | |
822 | } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes)); | |
823 | } | |
824 | } | |
825 | void sub_local_num_bytes(int64_t num_bytes) { | |
826 | ceph_assert(num_bytes >= 0); | |
827 | if (num_bytes) { | |
828 | int64_t prev_bytes = local_num_bytes.load(); | |
829 | int64_t new_bytes; | |
830 | do { | |
831 | new_bytes = prev_bytes - num_bytes; | |
832 | if (new_bytes < 0) | |
833 | new_bytes = 0; | |
834 | } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes)); | |
835 | } | |
836 | } | |
837 | // The value of num_bytes could be negative, | |
838 | // but we don't let info.stats.stats.sum.num_bytes go negative. | |
839 | void add_num_bytes(int64_t num_bytes) { | |
9f95a23c | 840 | ceph_assert(ceph_mutex_is_locked_by_me(_lock)); |
11fdf7f2 | 841 | if (num_bytes) { |
9f95a23c TL |
842 | recovery_state.update_stats( |
843 | [num_bytes](auto &history, auto &stats) { | |
844 | stats.stats.sum.num_bytes += num_bytes; | |
845 | if (stats.stats.sum.num_bytes < 0) { | |
846 | stats.stats.sum.num_bytes = 0; | |
847 | } | |
848 | return false; | |
849 | }); | |
11fdf7f2 TL |
850 | } |
851 | } | |
852 | void sub_num_bytes(int64_t num_bytes) { | |
9f95a23c | 853 | ceph_assert(ceph_mutex_is_locked_by_me(_lock)); |
11fdf7f2 TL |
854 | ceph_assert(num_bytes >= 0); |
855 | if (num_bytes) { | |
9f95a23c TL |
856 | recovery_state.update_stats( |
857 | [num_bytes](auto &history, auto &stats) { | |
858 | stats.stats.sum.num_bytes -= num_bytes; | |
859 | if (stats.stats.sum.num_bytes < 0) { | |
860 | stats.stats.sum.num_bytes = 0; | |
861 | } | |
862 | return false; | |
863 | }); | |
11fdf7f2 TL |
864 | } |
865 | } | |
866 | ||
867 | // Only used in testing; acquires the PG lock internally |
868 | int64_t get_stats_num_bytes() { | |
9f95a23c | 869 | std::lock_guard l{_lock}; |
11fdf7f2 TL |
870 | int num_bytes = info.stats.stats.sum.num_bytes; |
871 | if (pool.info.is_erasure()) { | |
872 | num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count(); | |
873 | // Round up each object by a stripe | |
874 | num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects; | |
875 | } | |
876 | int64_t lnb = local_num_bytes.load(); | |
877 | if (lnb && lnb != num_bytes) { | |
878 | lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch " | |
879 | << lnb << " vs stats " | |
880 | << info.stats.stats.sum.num_bytes << " / chunk " | |
881 | << get_pgbackend()->get_ec_data_chunk_count() | |
882 | << dendl; | |
883 | } | |
884 | return num_bytes; | |
885 | } | |
886 | ||
7c673cae FG |
887 | protected: |
888 | ||
889 | /* | |
890 | * blocked request wait hierarchy | |
891 | * | |
892 | * In order to preserve request ordering we need to be careful about the | |
893 | * order in which blocked requests get requeued. Generally speaking, we | |
894 | * push the requests back up to the op_wq in reverse order (most recent | |
895 | * request first) so that they come back out again in the original order. | |
896 | * However, because there are multiple wait queues, we need to requeue | |
897 | * waitlists in order. Generally speaking, we requeue the wait lists | |
898 | * that are checked first. | |
899 | * | |
900 | * Here are the various wait lists, in the order they are used during | |
901 | * request processing, with notes: | |
902 | * | |
903 | * - waiting_for_map | |
904 | * - may start or stop blocking at any time (depending on client epoch) | |
905 | * - waiting_for_peered | |
9f95a23c | 906 | * - !is_peered() |
7c673cae | 907 | * - only starts blocking on interval change; never restarts |
9f95a23c TL |
908 | * - waiting_for_flush |
909 | * - flushes_in_progress | |
910 | * - waiting for final flush during activate | |
7c673cae FG |
911 | * - waiting_for_active |
912 | * - !is_active() | |
913 | * - only starts blocking on interval change; never restarts | |
9f95a23c TL |
914 | * - waiting_for_readable |
915 | * - now > readable_until | |
916 | * - unblocks when we get fresh(er) osd_pings | |
7c673cae FG |
917 | * - waiting_for_scrub |
918 | * - starts and stops blocking for varying intervals during scrub | |
919 | * - waiting_for_unreadable_object | |
920 | * - never restarts once object is readable (* except for EIO?) | |
921 | * - waiting_for_degraded_object | |
922 | * - never restarts once object is writeable (* except for EIO?) | |
923 | * - waiting_for_blocked_object | |
924 | * - starts and stops based on proxied op activity | |
925 | * - obc rwlocks | |
926 | * - starts and stops based on read/write activity | |
927 | * | |
928 | * Notes: | |
929 | * | |
930 | * 1. During an interval change, we requeue *everything* in the above order. | |
931 | * | |
932 | * 2. When an obc rwlock is released, we check for a scrub block and requeue | |
933 | * the op there if it applies. We ignore the unreadable/degraded/blocked | |
934 | * queues because we assume they cannot apply at that time (this is | |
935 | * probably mostly true). | |
936 | * | |
937 | * 3. The requeue_ops helper will push ops onto the waiting_for_map list if | |
938 | * it is non-empty. | |
939 | * | |
940 | * These three behaviors are generally sufficient to maintain ordering, with | |
941 | * the possible exception of cases where we make an object degraded or | |
942 | * unreadable that was previously okay, e.g. when scrub or op processing | |
943 | * encounter an unexpected error. FIXME. | |
944 | */ | |
945 | ||
7c673cae FG |
946 | // ops with newer maps than ours (or blocked behind them) |
947 | // track these by client, since inter-request ordering doesn't otherwise | |
948 | // matter. | |
949 | unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map; | |
950 | ||
951 | // ops waiting on peered | |
952 | list<OpRequestRef> waiting_for_peered; | |
953 | ||
9f95a23c TL |
954 | /// ops waiting on readable |
955 | list<OpRequestRef> waiting_for_readable; | |
956 | ||
7c673cae FG |
957 | // ops waiting on active (require peered as well) |
958 | list<OpRequestRef> waiting_for_active; | |
b32b8144 | 959 | list<OpRequestRef> waiting_for_flush; |
7c673cae FG |
960 | list<OpRequestRef> waiting_for_scrub; |
961 | ||
962 | list<OpRequestRef> waiting_for_cache_not_full; | |
224ce89b | 963 | list<OpRequestRef> waiting_for_clean_to_primary_repair; |
7c673cae FG |
964 | map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object, |
965 | waiting_for_degraded_object, | |
966 | waiting_for_blocked_object; | |
967 | ||
968 | set<hobject_t> objects_blocked_on_cache_full; | |
969 | map<hobject_t,snapid_t> objects_blocked_on_degraded_snap; | |
970 | map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion; | |
971 | ||
972 | // Callbacks should assume pg (and nothing else) is locked | |
973 | map<hobject_t, list<Context*>> callbacks_for_degraded_object; | |
974 | ||
975 | map<eversion_t, | |
9f95a23c TL |
976 | list< |
977 | tuple<OpRequestRef, version_t, int, | |
978 | vector<pg_log_op_return_item_t>>>> waiting_for_ondisk; | |
7c673cae FG |
979 | |
980 | void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m); | |
981 | void requeue_op(OpRequestRef op); | |
982 | void requeue_ops(list<OpRequestRef> &l); | |
983 | ||
984 | // stats that persist lazily | |
985 | object_stat_collection_t unstable_stats; | |
986 | ||
987 | // publish stats | |
9f95a23c TL |
988 | ceph::mutex pg_stats_publish_lock = |
989 | ceph::make_mutex("PG::pg_stats_publish_lock"); | |
7c673cae FG |
990 | bool pg_stats_publish_valid; |
991 | pg_stat_t pg_stats_publish; | |
992 | ||
a8e16298 | 993 | friend class TestOpsSocketHook; |
9f95a23c | 994 | void publish_stats_to_osd() override; |
7c673cae | 995 | |
9f95a23c TL |
996 | bool needs_recovery() const { |
997 | return recovery_state.needs_recovery(); | |
7c673cae | 998 | } |
9f95a23c TL |
999 | bool needs_backfill() const { |
1000 | return recovery_state.needs_backfill(); | |
7c673cae | 1001 | } |
7c673cae FG |
1002 | |
1003 | bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const; | |
7c673cae FG |
1004 | |
1005 | struct PGLogEntryHandler : public PGLog::LogEntryHandler { | |
1006 | PG *pg; | |
1007 | ObjectStore::Transaction *t; | |
1008 | PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {} | |
1009 | ||
1010 | // LogEntryHandler | |
1011 | void remove(const hobject_t &hoid) override { | |
1012 | pg->get_pgbackend()->remove(hoid, t); | |
1013 | } | |
1014 | void try_stash(const hobject_t &hoid, version_t v) override { | |
1015 | pg->get_pgbackend()->try_stash(hoid, v, t); | |
1016 | } | |
1017 | void rollback(const pg_log_entry_t &entry) override { | |
11fdf7f2 | 1018 | ceph_assert(entry.can_rollback()); |
7c673cae FG |
1019 | pg->get_pgbackend()->rollback(entry, t); |
1020 | } | |
1021 | void rollforward(const pg_log_entry_t &entry) override { | |
1022 | pg->get_pgbackend()->rollforward(entry, t); | |
1023 | } | |
1024 | void trim(const pg_log_entry_t &entry) override { | |
1025 | pg->get_pgbackend()->trim(entry, t); | |
1026 | } | |
1027 | }; | |
1028 | ||
1029 | void update_object_snap_mapping( | |
1030 | ObjectStore::Transaction *t, const hobject_t &soid, | |
1031 | const set<snapid_t> &snaps); | |
1032 | void clear_object_snap_mapping( | |
1033 | ObjectStore::Transaction *t, const hobject_t &soid); | |
1034 | void remove_snap_mapped_object( | |
1035 | ObjectStore::Transaction& t, const hobject_t& soid); | |
7c673cae FG |
1036 | |
1037 | bool have_unfound() const { | |
9f95a23c | 1038 | return recovery_state.have_unfound(); |
7c673cae FG |
1039 | } |
1040 | uint64_t get_num_unfound() const { | |
9f95a23c | 1041 | return recovery_state.get_num_unfound(); |
81eedcae | 1042 | } |
7c673cae FG |
1043 | |
1044 | virtual void check_local() = 0; | |
1045 | ||
7c673cae FG |
1046 | void purge_strays(); |
1047 | ||
9f95a23c | 1048 | void update_heartbeat_peers(set<int> peers) override; |
7c673cae FG |
1049 | |
1050 | Context *finish_sync_event; | |
1051 | ||
11fdf7f2 | 1052 | Context *finish_recovery(); |
7c673cae | 1053 | void _finish_recovery(Context *c); |
11fdf7f2 TL |
1054 | struct C_PG_FinishRecovery : public Context { |
1055 | PGRef pg; | |
1056 | explicit C_PG_FinishRecovery(PG *p) : pg(p) {} | |
1057 | void finish(int r) override { | |
1058 | pg->_finish_recovery(this); | |
1059 | } | |
1060 | }; | |
7c673cae FG |
1061 | void cancel_recovery(); |
1062 | void clear_recovery_state(); | |
1063 | virtual void _clear_recovery_state() = 0; | |
7c673cae FG |
1064 | void start_recovery_op(const hobject_t& soid); |
1065 | void finish_recovery_op(const hobject_t& soid, bool dequeue=false); | |
1066 | ||
7c673cae FG |
1067 | virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0; |
1068 | ||
1069 | friend class C_OSD_RepModify_Commit; | |
11fdf7f2 | 1070 | friend class C_DeleteMore; |
7c673cae FG |
1071 | |
1072 | // -- backoff -- | |
9f95a23c TL |
1073 | ceph::mutex backoff_lock = // orders inside Backoff::lock |
1074 | ceph::make_mutex("PG::backoff_lock"); | |
1075 | map<hobject_t,set<ceph::ref_t<Backoff>>> backoffs; | |
7c673cae | 1076 | |
9f95a23c | 1077 | void add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end); |
7c673cae FG |
1078 | void release_backoffs(const hobject_t& begin, const hobject_t& end); |
1079 | void release_backoffs(const hobject_t& o) { | |
1080 | release_backoffs(o, o); | |
1081 | } | |
1082 | void clear_backoffs(); | |
1083 | ||
9f95a23c | 1084 | void add_pg_backoff(const ceph::ref_t<Session>& s) { |
7c673cae FG |
1085 | hobject_t begin = info.pgid.pgid.get_hobj_start(); |
1086 | hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); | |
1087 | add_backoff(s, begin, end); | |
1088 | } | |
eafe8130 | 1089 | public: |
7c673cae FG |
1090 | void release_pg_backoffs() { |
1091 | hobject_t begin = info.pgid.pgid.get_hobj_start(); | |
1092 | hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num()); | |
1093 | release_backoffs(begin, end); | |
1094 | } | |
eafe8130 | 1095 | protected: |
7c673cae | 1096 | |
7c673cae | 1097 | // -- scrub -- |
11fdf7f2 | 1098 | public: |
7c673cae FG |
1099 | struct Scrubber { |
1100 | Scrubber(); | |
1101 | ~Scrubber(); | |
1102 | ||
1103 | // metadata | |
1104 | set<pg_shard_t> reserved_peers; | |
eafe8130 | 1105 | bool local_reserved, remote_reserved, reserve_failed; |
7c673cae FG |
1106 | epoch_t epoch_start; |
1107 | ||
1108 | // common to both scrubs | |
1109 | bool active; | |
7c673cae FG |
1110 | set<pg_shard_t> waiting_on_whom; |
1111 | int shallow_errors; | |
1112 | int deep_errors; | |
1113 | int fixed; | |
1114 | ScrubMap primary_scrubmap; | |
28e407b8 AA |
1115 | ScrubMapBuilder primary_scrubmap_pos; |
1116 | epoch_t replica_scrub_start = 0; | |
1117 | ScrubMap replica_scrubmap; | |
1118 | ScrubMapBuilder replica_scrubmap_pos; | |
7c673cae FG |
1119 | map<pg_shard_t, ScrubMap> received_maps; |
1120 | OpRequestRef active_rep_scrub; | |
1121 | utime_t scrub_reg_stamp; // stamp we registered for | |
1122 | ||
494da23a TL |
1123 | static utime_t scrub_must_stamp() { return utime_t(0,1); } |
1124 | ||
11fdf7f2 TL |
1125 | omap_stat_t omap_stats = (const struct omap_stat_t){ 0 }; |
1126 | ||
7c673cae FG |
1127 | // For async sleep |
1128 | bool sleeping = false; | |
1129 | bool needs_sleep = true; | |
1130 | utime_t sleep_start; | |
1131 | ||
1132 | // flags to indicate explicitly requested scrubs (by admin) | |
494da23a | 1133 | bool must_scrub, must_deep_scrub, must_repair, need_auto; |
7c673cae FG |
1134 | |
1135 | // Priority to use for scrub scheduling | |
11fdf7f2 | 1136 | unsigned priority = 0; |
7c673cae | 1137 | |
494da23a | 1138 | bool time_for_deep; |
7c673cae FG |
1139 | // this flag indicates whether we would like to do auto-repair of the PG or not |
1140 | bool auto_repair; | |
11fdf7f2 TL |
1141 | // this flag indicates that we are scrubbing post repair to verify everything is fixed |
1142 | bool check_repair; | |
1143 | // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors, | |
1144 | // we should deep scrub in order to auto repair | |
1145 | bool deep_scrub_on_error; | |
7c673cae FG |
1146 | |
1147 | // Maps from objects with errors to missing/inconsistent peers | |
1148 | map<hobject_t, set<pg_shard_t>> missing; | |
1149 | map<hobject_t, set<pg_shard_t>> inconsistent; | |
1150 | ||
1151 | // Map from object with errors to good peers | |
1152 | map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative; | |
1153 | ||
1154 | // Cleaned map pending snap metadata scrub | |
1155 | ScrubMap cleaned_meta_map; | |
1156 | ||
28e407b8 AA |
1157 | void clean_meta_map(ScrubMap &for_meta_scrub) { |
1158 | if (end.is_max() || | |
1159 | cleaned_meta_map.objects.empty()) { | |
1160 | cleaned_meta_map.swap(for_meta_scrub); | |
1161 | } else { | |
1162 | auto iter = cleaned_meta_map.objects.end(); | |
1163 | --iter; // not empty, see if clause | |
1164 | auto begin = cleaned_meta_map.objects.begin(); | |
1165 | if (iter->first.has_snapset()) { | |
1166 | ++iter; | |
1167 | } else { | |
1168 | while (iter != begin) { | |
1169 | auto next = iter--; | |
1170 | if (next->first.get_head() != iter->first.get_head()) { | |
1171 | ++iter; | |
1172 | break; | |
1173 | } | |
1174 | } | |
1175 | } | |
1176 | for_meta_scrub.objects.insert(begin, iter); | |
1177 | cleaned_meta_map.objects.erase(begin, iter); | |
1178 | } | |
1179 | } | |
1180 | ||
7c673cae FG |
1181 | // digest updates which we are waiting on |
1182 | int num_digest_updates_pending; | |
1183 | ||
1184 | // chunky scrub | |
28e407b8 AA |
1185 | hobject_t start, end; // [start,end) |
1186 | hobject_t max_end; // Largest end that may have been sent to replicas | |
7c673cae FG |
1187 | eversion_t subset_last_update; |
1188 | ||
1189 | // chunky scrub state | |
1190 | enum State { | |
1191 | INACTIVE, | |
1192 | NEW_CHUNK, | |
1193 | WAIT_PUSHES, | |
1194 | WAIT_LAST_UPDATE, | |
1195 | BUILD_MAP, | |
28e407b8 | 1196 | BUILD_MAP_DONE, |
7c673cae FG |
1197 | WAIT_REPLICAS, |
1198 | COMPARE_MAPS, | |
1199 | WAIT_DIGEST_UPDATES, | |
1200 | FINISH, | |
28e407b8 | 1201 | BUILD_MAP_REPLICA, |
7c673cae FG |
1202 | } state; |
1203 | ||
1204 | std::unique_ptr<Scrub::Store> store; | |
1205 | // deep scrub | |
1206 | bool deep; | |
28e407b8 AA |
1207 | int preempt_left; |
1208 | int preempt_divisor; | |
7c673cae FG |
1209 | |
1210 | list<Context*> callbacks; | |
1211 | void add_callback(Context *context) { | |
1212 | callbacks.push_back(context); | |
1213 | } | |
1214 | void run_callbacks() { | |
1215 | list<Context*> to_run; | |
1216 | to_run.swap(callbacks); | |
1217 | for (list<Context*>::iterator i = to_run.begin(); | |
1218 | i != to_run.end(); | |
1219 | ++i) { | |
1220 | (*i)->complete(0); | |
1221 | } | |
1222 | } | |
1223 | ||
1224 | static const char *state_string(const PG::Scrubber::State& state) { | |
1225 | const char *ret = NULL; | |
1226 | switch( state ) | |
1227 | { | |
1228 | case INACTIVE: ret = "INACTIVE"; break; | |
1229 | case NEW_CHUNK: ret = "NEW_CHUNK"; break; | |
1230 | case WAIT_PUSHES: ret = "WAIT_PUSHES"; break; | |
1231 | case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break; | |
1232 | case BUILD_MAP: ret = "BUILD_MAP"; break; | |
28e407b8 | 1233 | case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break; |
7c673cae FG |
1234 | case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break; |
1235 | case COMPARE_MAPS: ret = "COMPARE_MAPS"; break; | |
1236 | case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break; | |
1237 | case FINISH: ret = "FINISH"; break; | |
28e407b8 | 1238 | case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break; |
7c673cae FG |
1239 | } |
1240 | return ret; | |
1241 | } | |
1242 | ||
1243 | bool is_chunky_scrub_active() const { return state != INACTIVE; } | |
1244 | ||
7c673cae FG |
1245 | // clear all state |
1246 | void reset() { | |
1247 | active = false; | |
7c673cae FG |
1248 | waiting_on_whom.clear(); |
1249 | if (active_rep_scrub) { | |
1250 | active_rep_scrub = OpRequestRef(); | |
1251 | } | |
1252 | received_maps.clear(); | |
1253 | ||
1254 | must_scrub = false; | |
1255 | must_deep_scrub = false; | |
1256 | must_repair = false; | |
494da23a TL |
1257 | need_auto = false; |
1258 | time_for_deep = false; | |
7c673cae | 1259 | auto_repair = false; |
11fdf7f2 TL |
1260 | check_repair = false; |
1261 | deep_scrub_on_error = false; | |
7c673cae FG |
1262 | |
1263 | state = PG::Scrubber::INACTIVE; | |
1264 | start = hobject_t(); | |
1265 | end = hobject_t(); | |
28e407b8 | 1266 | max_end = hobject_t(); |
7c673cae FG |
1267 | subset_last_update = eversion_t(); |
1268 | shallow_errors = 0; | |
1269 | deep_errors = 0; | |
1270 | fixed = 0; | |
11fdf7f2 | 1271 | omap_stats = (const struct omap_stat_t){ 0 }; |
7c673cae | 1272 | deep = false; |
7c673cae FG |
1273 | run_callbacks(); |
1274 | inconsistent.clear(); | |
1275 | missing.clear(); | |
1276 | authoritative.clear(); | |
1277 | num_digest_updates_pending = 0; | |
28e407b8 AA |
1278 | primary_scrubmap = ScrubMap(); |
1279 | primary_scrubmap_pos.reset(); | |
1280 | replica_scrubmap = ScrubMap(); | |
1281 | replica_scrubmap_pos.reset(); | |
7c673cae FG |
1282 | cleaned_meta_map = ScrubMap(); |
1283 | sleeping = false; | |
1284 | needs_sleep = true; | |
1285 | sleep_start = utime_t(); | |
1286 | } | |
1287 | ||
1288 | void create_results(const hobject_t& obj); | |
1289 | void cleanup_store(ObjectStore::Transaction *t); | |
1290 | } scrubber; | |
1291 | ||
11fdf7f2 | 1292 | protected: |
7c673cae FG |
1293 | bool scrub_after_recovery; |
1294 | ||
1295 | int active_pushes; | |
1296 | ||
28e407b8 AA |
1297 | bool scrub_can_preempt = false; |
1298 | bool scrub_preempted = false; | |
1299 | ||
1300 | // we allow some number of preemptions of the scrub, which mean we do | |
1301 | // not block. then we start to block. once we start blocking, we do | |
1302 | // not stop until the scrub range is completed. | |
1303 | bool write_blocked_by_scrub(const hobject_t &soid); | |
1304 | ||
1305 | /// true if the given range intersects the scrub interval in any way | |
1306 | bool range_intersects_scrub(const hobject_t &start, const hobject_t& end); | |
1307 | ||
7c673cae | 1308 | void repair_object( |
9f95a23c TL |
1309 | const hobject_t &soid, |
1310 | const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers, | |
1311 | const set<pg_shard_t> &bad_peers); | |
7c673cae | 1312 | |
7c673cae FG |
1313 | void chunky_scrub(ThreadPool::TPHandle &handle); |
1314 | void scrub_compare_maps(); | |
1315 | /** | |
1316 | * return true if any inconsistency/missing is repaired, false otherwise | |
1317 | */ | |
1318 | bool scrub_process_inconsistent(); | |
31f18b77 | 1319 | bool ops_blocked_by_scrub() const; |
7c673cae | 1320 | void scrub_finish(); |
11fdf7f2 | 1321 | void scrub_clear_state(bool keep_repair = false); |
7c673cae | 1322 | void _scan_snaps(ScrubMap &map); |
224ce89b | 1323 | void _repair_oinfo_oid(ScrubMap &map); |
11fdf7f2 | 1324 | void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs); |
7c673cae FG |
1325 | void _request_scrub_map(pg_shard_t replica, eversion_t version, |
1326 | hobject_t start, hobject_t end, bool deep, | |
28e407b8 | 1327 | bool allow_preemption); |
7c673cae FG |
1328 | int build_scrub_map_chunk( |
1329 | ScrubMap &map, | |
28e407b8 AA |
1330 | ScrubMapBuilder &pos, |
1331 | hobject_t start, hobject_t end, bool deep, | |
7c673cae FG |
1332 | ThreadPool::TPHandle &handle); |
1333 | /** | |
1334 | * returns true if [begin, end) is good to scrub at this time | |
1335 | * a false return value obliges the implementer to requeue scrub when the | |
1336 | * condition preventing scrub clears | |
1337 | */ | |
1338 | virtual bool _range_available_for_scrub( | |
1339 | const hobject_t &begin, const hobject_t &end) = 0; | |
1340 | virtual void scrub_snapshot_metadata( | |
1341 | ScrubMap &map, | |
28e407b8 | 1342 | const std::map<hobject_t, |
9f95a23c TL |
1343 | pair<std::optional<uint32_t>, |
1344 | std::optional<uint32_t>>> &missing_digest) { } | |
7c673cae FG |
1345 | virtual void _scrub_clear_state() { } |
1346 | virtual void _scrub_finish() { } | |
7c673cae FG |
1347 | void clear_scrub_reserved(); |
1348 | void scrub_reserve_replicas(); | |
1349 | void scrub_unreserve_replicas(); | |
1350 | bool scrub_all_replicas_reserved() const; | |
7c673cae FG |
1351 | |
1352 | void replica_scrub( | |
1353 | OpRequestRef op, | |
1354 | ThreadPool::TPHandle &handle); | |
1355 | void do_replica_scrub_map(OpRequestRef op); | |
7c673cae FG |
1356 | |
1357 | void handle_scrub_reserve_request(OpRequestRef op); | |
1358 | void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from); | |
1359 | void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from); | |
1360 | void handle_scrub_reserve_release(OpRequestRef op); | |
1361 | ||
7c673cae FG |
1362 | // -- recovery state -- |
1363 | ||
7c673cae FG |
1364 | struct QueuePeeringEvt : Context { |
1365 | PGRef pg; | |
9f95a23c TL |
1366 | PGPeeringEventRef evt; |
1367 | ||
1368 | template <class EVT> | |
7c673cae | 1369 | QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) : |
9f95a23c TL |
1370 | pg(pg), evt(std::make_shared<PGPeeringEvent>(epoch, epoch, evt)) {} |
1371 | ||
1372 | QueuePeeringEvt(PG *pg, PGPeeringEventRef evt) : | |
1373 | pg(pg), evt(std::move(evt)) {} | |
1374 | ||
7c673cae FG |
1375 | void finish(int r) override { |
1376 | pg->lock(); | |
9f95a23c | 1377 | pg->queue_peering_event(std::move(evt)); |
7c673cae FG |
1378 | pg->unlock(); |
1379 | } | |
1380 | }; | |
1381 | ||
7c673cae | 1382 | |
11fdf7f2 TL |
1383 | public: |
1384 | int pg_stat_adjust(osd_stat_t *new_stat); | |
1385 | protected: | |
11fdf7f2 | 1386 | bool delete_needs_sleep = false; |
a8e16298 | 1387 | |
11fdf7f2 | 1388 | protected: |
9f95a23c TL |
1389 | bool state_test(uint64_t m) const { return recovery_state.state_test(m); } |
1390 | void state_set(uint64_t m) { recovery_state.state_set(m); } | |
1391 | void state_clear(uint64_t m) { recovery_state.state_clear(m); } | |
1392 | ||
1393 | bool is_complete() const { | |
1394 | return recovery_state.is_complete(); | |
1395 | } | |
1396 | bool should_send_notify() const { | |
1397 | return recovery_state.should_send_notify(); | |
1398 | } | |
1399 | ||
1400 | bool is_active() const { return recovery_state.is_active(); } | |
1401 | bool is_activating() const { return recovery_state.is_activating(); } | |
1402 | bool is_peering() const { return recovery_state.is_peering(); } | |
1403 | bool is_down() const { return recovery_state.is_down(); } | |
1404 | bool is_recovery_unfound() const { return recovery_state.is_recovery_unfound(); } | |
1405 | bool is_backfill_unfound() const { return recovery_state.is_backfill_unfound(); } | |
1406 | bool is_incomplete() const { return recovery_state.is_incomplete(); } | |
1407 | bool is_clean() const { return recovery_state.is_clean(); } | |
1408 | bool is_degraded() const { return recovery_state.is_degraded(); } | |
1409 | bool is_undersized() const { return recovery_state.is_undersized(); } | |
11fdf7f2 | 1410 | bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); } |
9f95a23c TL |
1411 | bool is_remapped() const { return recovery_state.is_remapped(); } |
1412 | bool is_peered() const { return recovery_state.is_peered(); } | |
1413 | bool is_recovering() const { return recovery_state.is_recovering(); } | |
1414 | bool is_premerge() const { return recovery_state.is_premerge(); } | |
1415 | bool is_repair() const { return recovery_state.is_repair(); } | |
1416 | bool is_laggy() const { return state_test(PG_STATE_LAGGY); } | |
1417 | bool is_wait() const { return state_test(PG_STATE_WAIT); } | |
7c673cae | 1418 | |
9f95a23c | 1419 | bool is_empty() const { return recovery_state.is_empty(); } |
7c673cae FG |
1420 | |
1421 | // pg on-disk state | |
1422 | void do_pending_flush(); | |
1423 | ||
11fdf7f2 | 1424 | public: |
9f95a23c | 1425 | virtual void prepare_write( |
7c673cae FG |
1426 | pg_info_t &info, |
1427 | pg_info_t &last_written_info, | |
1428 | PastIntervals &past_intervals, | |
9f95a23c TL |
1429 | PGLog &pglog, |
1430 | bool dirty_info, | |
7c673cae | 1431 | bool dirty_big_info, |
9f95a23c TL |
1432 | bool need_write_epoch, |
1433 | ObjectStore::Transaction &t) override; | |
11fdf7f2 | 1434 | |
9f95a23c TL |
1435 | void write_if_dirty(PeeringCtx &rctx) { |
1436 | write_if_dirty(rctx.transaction); | |
11fdf7f2 TL |
1437 | } |
1438 | protected: | |
9f95a23c TL |
1439 | void write_if_dirty(ObjectStore::Transaction& t) { |
1440 | recovery_state.write_if_dirty(t); | |
1441 | } | |
7c673cae FG |
1442 | |
1443 | PGLog::IndexedLog projected_log; | |
1444 | bool check_in_progress_op( | |
1445 | const osd_reqid_t &r, | |
1446 | eversion_t *version, | |
1447 | version_t *user_version, | |
9f95a23c TL |
1448 | int *return_code, |
1449 | vector<pg_log_op_return_item_t> *op_returns) const; | |
7c673cae FG |
1450 | eversion_t projected_last_update; |
1451 | eversion_t get_next_version() const { | |
1452 | eversion_t at_version( | |
11fdf7f2 | 1453 | get_osdmap_epoch(), |
7c673cae | 1454 | projected_last_update.version+1); |
11fdf7f2 | 1455 | ceph_assert(at_version > info.last_update); |
9f95a23c | 1456 | ceph_assert(at_version > recovery_state.get_pg_log().get_head()); |
11fdf7f2 | 1457 | ceph_assert(at_version > projected_last_update); |
7c673cae FG |
1458 | return at_version; |
1459 | } | |
1460 | ||
7c673cae | 1461 | bool check_log_for_corruption(ObjectStore *store); |
7c673cae FG |
1462 | |
1463 | std::string get_corrupt_pg_log_name() const; | |
11fdf7f2 | 1464 | |
7c673cae FG |
1465 | void update_snap_map( |
1466 | const vector<pg_log_entry_t> &log_entries, | |
1467 | ObjectStore::Transaction& t); | |
1468 | ||
1469 | void filter_snapc(vector<snapid_t> &snaps); | |
1470 | ||
7c673cae FG |
1471 | virtual void kick_snap_trim() = 0; |
1472 | virtual void snap_trimmer_scrub_complete() = 0; | |
31f18b77 | 1473 | bool requeue_scrub(bool high_priority = false); |
c07f9fc5 | 1474 | void queue_recovery(); |
7c673cae FG |
1475 | bool queue_scrub(); |
1476 | unsigned get_scrub_priority(); | |
1477 | ||
9f95a23c TL |
1478 | bool try_flush_or_schedule_async() override; |
1479 | void start_flush_on_transaction( | |
1480 | ObjectStore::Transaction &t) override; | |
7c673cae | 1481 | |
9f95a23c TL |
1482 | void update_history(const pg_history_t& history) { |
1483 | recovery_state.update_history(history); | |
1484 | } | |
7c673cae FG |
1485 | |
1486 | // OpRequest queueing | |
1487 | bool can_discard_op(OpRequestRef& op); | |
1488 | bool can_discard_scan(OpRequestRef op); | |
1489 | bool can_discard_backfill(OpRequestRef op); | |
1490 | bool can_discard_request(OpRequestRef& op); | |
1491 | ||
1492 | template<typename T, int MSGTYPE> | |
1493 | bool can_discard_replica_op(OpRequestRef& op); | |
1494 | ||
1495 | bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch); | |
11fdf7f2 | 1496 | bool old_peering_evt(PGPeeringEventRef evt) { |
7c673cae FG |
1497 | return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested()); |
1498 | } | |
7c673cae | 1499 | bool have_same_or_newer_map(epoch_t e) { |
11fdf7f2 | 1500 | return e <= get_osdmap_epoch(); |
7c673cae FG |
1501 | } |
1502 | ||
1503 | bool op_has_sufficient_caps(OpRequestRef& op); | |
1504 | ||
7c673cae | 1505 | // abstract bits |
11fdf7f2 | 1506 | friend class FlushState; |
7c673cae | 1507 | |
9f95a23c TL |
1508 | friend ostream& operator<<(ostream& out, const PG& pg); |
1509 | ||
92f5a8d4 | 1510 | protected: |
9f95a23c | 1511 | PeeringState recovery_state; |
7c673cae | 1512 | |
9f95a23c TL |
1513 | // ref to recovery_state.pool |
1514 | const PGPool &pool; | |
1515 | ||
1516 | // ref to recovery_state.info | |
1517 | const pg_info_t &info; | |
7c673cae FG |
1518 | }; |
1519 | ||
7c673cae FG |
1520 | |
1521 | ostream& operator<<(ostream& out, const PG::BackfillInterval& bi); | |
1522 | ||
1523 | #endif |