// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_PG_H
#define CEPH_PG_H

#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/state.hpp>
#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event_base.hpp>
#include <boost/scoped_ptr.hpp>
#include <boost/circular_buffer.hpp>
#include "include/memory.h"
#include "include/mempool.h"

// re-include our assert to clobber boost's
#include "include/assert.h"

#include "include/types.h"
#include "include/stringify.h"
#include "osd_types.h"
#include "include/xlist.h"
#include "SnapMapper.h"
#include "Session.h"
#include "common/Timer.h"

#include "PGLog.h"
#include "OSDMap.h"
#include "messages/MOSDPGLog.h"
#include "include/str_list.h"
#include "PGBackend.h"

#include <atomic>
#include <list>
#include <memory>
#include <stack>
#include <string>
#include <tuple>
using namespace std;

// #include "include/unordered_map.h"
// #include "include/unordered_set.h"

//#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs

class OSD;
class OSDService;
class MOSDOp;
class MOSDPGScan;
class MOSDPGBackfill;
class MOSDPGInfo;

class PG;
struct OpRequest;
typedef OpRequest::Ref OpRequestRef;
class MOSDPGLog;
class CephContext;

namespace Scrub {
  class Store;
}

void intrusive_ptr_add_ref(PG *pg);
void intrusive_ptr_release(PG *pg);

using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
using embedded_state = std::pair<utime_t, const char*>;

struct PGStateInstance {
  // Time spent in pg states

  void setepoch(const epoch_t current_epoch) {
    this_epoch = current_epoch;
  }

  void enter_state(const utime_t entime, const char* state) {
    embedded_states.push(std::make_pair(entime, state));
  }

  void exit_state(const utime_t extime) {
    embedded_state this_state = embedded_states.top();
    state_history.push_back(state_history_entry{
        this_state.first, extime, this_state.second});
    embedded_states.pop();
  }

  epoch_t this_epoch;
  utime_t enter_time;
  std::vector<state_history_entry> state_history;
  std::stack<embedded_state> embedded_states;
};
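
/*
 * Usage sketch (illustrative, not part of the interface): states nest
 * LIFO, so each exit_state() closes the most recently entered state.
 *
 *   PGStateInstance psi;
 *   psi.setepoch(42);
 *   psi.enter_state(ceph_clock_now(), "Started");  // outer state
 *   psi.enter_state(ceph_clock_now(), "Primary");  // nested state
 *   psi.exit_state(ceph_clock_now());   // closes "Primary"
 *   psi.exit_state(ceph_clock_now());   // closes "Started"
 *   // state_history now holds one (enter, exit, name) tuple per state.
 */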

class PGStateHistory {
  // Member access protected with the PG lock
public:
  PGStateHistory() : buffer(10) {}

  void enter(PG* pg, const utime_t entime, const char* state);

  void exit(const char* state);

  void reset() {
    pi = nullptr;
  }

  void set_pg_in_destructor() { pg_in_destructor = true; }

  void dump(Formatter* f) const;

private:
  bool pg_in_destructor = false;
  PG* thispg = nullptr;
  std::unique_ptr<PGStateInstance> tmppi;
  PGStateInstance* pi = nullptr;
  boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;

};

#ifdef PG_DEBUG_REFS
#include "common/tracked_int_ptr.hpp"
  uint64_t get_with_id(PG *pg);
  void put_with_id(PG *pg, uint64_t id);
  typedef TrackedIntPtr<PG> PGRef;
#else
  typedef boost::intrusive_ptr<PG> PGRef;
#endif

class PGRecoveryStats {
  struct per_state_info {
    uint64_t enter, exit;     // enter/exit counts
    uint64_t events;
    utime_t event_time;       // time spent processing events
    utime_t total_time;       // total time in state
    utime_t min_time, max_time;

    // cppcheck-suppress unreachableCode
    per_state_info() : enter(0), exit(0), events(0) {}
  };
  map<const char *,per_state_info> info;
  Mutex lock;

  public:
  PGRecoveryStats() : lock("PGRecoveryStats::lock") {}

  void reset() {
    Mutex::Locker l(lock);
    info.clear();
  }
  void dump(ostream& out) {
    Mutex::Locker l(lock);
    for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
      per_state_info& i = p->second;
      out << i.enter << "\t" << i.exit << "\t"
          << i.events << "\t" << i.event_time << "\t"
          << i.total_time << "\t"
          << i.min_time << "\t" << i.max_time << "\t"
          << p->first << "\n";
    }
  }

  void dump_formatted(Formatter *f) {
    Mutex::Locker l(lock);
    f->open_array_section("pg_recovery_stats");
    for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
      per_state_info& i = p->second;
      f->open_object_section("recovery_state");
      f->dump_int("enter", i.enter);
      f->dump_int("exit", i.exit);
      f->dump_int("events", i.events);
      f->dump_stream("event_time") << i.event_time;
      f->dump_stream("total_time") << i.total_time;
      f->dump_stream("min_time") << i.min_time;
      f->dump_stream("max_time") << i.max_time;
      vector<string> states;
      get_str_vec(p->first, "/", states);
      f->open_array_section("nested_states");
      for (vector<string>::iterator st = states.begin();
           st != states.end(); ++st) {
        f->dump_string("state", *st);
      }
      f->close_section();
      f->close_section();
    }
    f->close_section();
  }

  void log_enter(const char *s) {
    Mutex::Locker l(lock);
    info[s].enter++;
  }
  void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
    Mutex::Locker l(lock);
    per_state_info &i = info[s];
    i.exit++;
    i.total_time += dur;
    if (dur > i.max_time)
      i.max_time = dur;
    if (dur < i.min_time || i.min_time == utime_t())
      i.min_time = dur;
    i.events += events;
    i.event_time += event_dur;
  }
};
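
/*
 * Usage sketch (illustrative): the recovery state machine brackets each
 * named state with log_enter()/log_exit(), and the aggregated counters
 * can then be dumped per state path.
 *
 *   PGRecoveryStats stats;
 *   stats.log_enter("Started/Primary/Peering");
 *   // ... state runs, handling some number of events ...
 *   stats.log_exit("Started/Primary/Peering", dur, events, event_dur);
 *   stats.dump(cout);  // tab-separated enter/exit/event/time columns
 */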

struct PGPool {
  CephContext* cct;
  epoch_t cached_epoch;
  int64_t id;
  string name;
  uint64_t auid;

  pg_pool_t info;
  SnapContext snapc;   // the default pool snapc, ready to go.

  interval_set<snapid_t> cached_removed_snaps;  // current removed_snaps set
  interval_set<snapid_t> newly_removed_snaps;   // newly removed in the last epoch

  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
    : cct(cct),
      cached_epoch(map->get_epoch()),
      id(i),
      name(map->get_pool_name(id)),
      auid(map->get_pg_pool(id)->auid) {
    const pg_pool_t *pi = map->get_pg_pool(id);
    assert(pi);
    info = *pi;
    snapc = pi->get_snap_context();
    pi->build_removed_snaps(cached_removed_snaps);
  }

  void update(OSDMapRef map);
};

/** PG - Replica Placement Group
 *
 */

class PG : public DoutPrefixProvider {
protected:
  OSDService *osd;
  CephContext *cct;
  OSDriver osdriver;
  SnapMapper snap_mapper;
  bool eio_errors_to_process = false;

  virtual PGBackend *get_pgbackend() = 0;
public:
  std::string gen_prefix() const override;
  CephContext *get_cct() const override { return cct; }
  unsigned get_subsys() const override { return ceph_subsys_osd; }

  /*** PG ****/
  void update_snap_mapper_bits(uint32_t bits) {
    snap_mapper.update_bits(bits);
  }
  /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
    return get_pgbackend()->get_is_recoverable_predicate();
  }
protected:
  OSDMapRef osdmap_ref;
  OSDMapRef last_persisted_osdmap_ref;
  PGPool pool;

  void requeue_map_waiters();

  void update_osdmap_ref(OSDMapRef newmap) {
    assert(_lock.is_locked_by_me());
    osdmap_ref = std::move(newmap);
  }

public:
  OSDMapRef get_osdmap() const {
    assert(is_locked());
    assert(osdmap_ref);
    return osdmap_ref;
  }
protected:

  /** locking and reference counting.
   * I destroy myself when the reference count hits zero.
   * lock() should be called before doing anything.
   * get() should be called on pointer copy (to another thread, etc.).
   * put() should be called on destruction of some previously copied pointer.
   * unlock() when done with the current pointer (_most common_).
   */
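  /*
   * Illustrative pattern (the "worker" tag here is hypothetical): hand a
   * PG pointer to another thread with get(), then lock/unlock around use
   * and drop the ref with put() when done.
   *
   *   pg->get("worker");   // pin the PG while the pointer is copied
   *   pg->lock();
   *   // ... inspect or mutate PG state ...
   *   pg->unlock();        // asserts dirty_info/dirty_big_info were flushed
   *   pg->put("worker");   // may destroy the PG when the count hits zero
   */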
  mutable Mutex _lock;
  std::atomic_uint ref{0};

#ifdef PG_DEBUG_REFS
  Mutex _ref_id_lock;
  map<uint64_t, string> _live_ids;
  map<string, uint64_t> _tag_counts;
  uint64_t _ref_id;
#endif

public:
  bool deleting;  // true while the PG is being removed or the OSD is shutting down

  ZTracer::Endpoint trace_endpoint;

  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
  void lock(bool no_lockdep = false) const;
  void unlock() const {
    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
    assert(!dirty_info);
    assert(!dirty_big_info);
    _lock.Unlock();
  }

  bool is_locked() const {
    return _lock.is_locked();
  }

#ifdef PG_DEBUG_REFS
  uint64_t get_with_id();
  void put_with_id(uint64_t);
  void dump_live_ids();
#endif
  void get(const char* tag);
  void put(const char* tag);

  bool dirty_info, dirty_big_info;

public:
  bool is_ec_pg() const {
    return pool.info.ec_pool();
  }
  // pg state
  pg_info_t info;               ///< current pg info
  pg_info_t last_written_info;  ///< last written info
  __u8 info_struct_v;
  static const __u8 cur_struct_v = 10;
  // v10 is the new past_intervals encoding
  // v9 was fastinfo_key addition
  // v8 was the move to a per-pg pgmeta object
  // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
  //   (first appeared in cuttlefish).
  static const __u8 compat_struct_v = 7;
  bool must_upgrade() {
    return info_struct_v < cur_struct_v;
  }
  bool can_upgrade() {
    return info_struct_v >= compat_struct_v;
  }
  void upgrade(ObjectStore *store);

  const coll_t coll;
  ObjectStore::CollectionHandle ch;
  PGLog pg_log;
  static string get_info_key(spg_t pgid) {
    return stringify(pgid) + "_info";
  }
  static string get_biginfo_key(spg_t pgid) {
    return stringify(pgid) + "_biginfo";
  }
  static string get_epoch_key(spg_t pgid) {
    return stringify(pgid) + "_epoch";
  }
  ghobject_t pgmeta_oid;

  class MissingLoc {
    map<hobject_t, pg_missing_item> needs_recovery_map;
    map<hobject_t, set<pg_shard_t> > missing_loc;
    set<pg_shard_t> missing_loc_sources;
    PG *pg;
    set<pg_shard_t> empty_set;
  public:
    boost::scoped_ptr<IsPGReadablePredicate> is_readable;
    boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
    explicit MissingLoc(PG *pg)
      : pg(pg) {}
    void set_backend_predicates(
      IsPGReadablePredicate *_is_readable,
      IsPGRecoverablePredicate *_is_recoverable) {
      is_readable.reset(_is_readable);
      is_recoverable.reset(_is_recoverable);
    }
    string gen_prefix() const { return pg->gen_prefix(); }
    bool needs_recovery(
      const hobject_t &hoid,
      eversion_t *v = 0) const {
      map<hobject_t, pg_missing_item>::const_iterator i =
        needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      if (v)
        *v = i->second.need;
      return true;
    }
    bool is_deleted(const hobject_t &hoid) const {
      auto i = needs_recovery_map.find(hoid);
      if (i == needs_recovery_map.end())
        return false;
      return i->second.is_delete();
    }
    bool is_unfound(const hobject_t &hoid) const {
      return needs_recovery(hoid) && !is_deleted(hoid) && (
        !missing_loc.count(hoid) ||
        !(*is_recoverable)(missing_loc.find(hoid)->second));
    }
    bool readable_with_acting(
      const hobject_t &hoid,
      const set<pg_shard_t> &acting) const;
    uint64_t num_unfound() const {
      uint64_t ret = 0;
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          ++ret;
      }
      return ret;
    }

    bool have_unfound() const {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
        if (is_unfound(i->first))
          return true;
      }
      return false;
    }
    void clear() {
      needs_recovery_map.clear();
      missing_loc.clear();
      missing_loc_sources.clear();
    }

    void add_location(const hobject_t &hoid, pg_shard_t location) {
      missing_loc[hoid].insert(location);
    }
    void remove_location(const hobject_t &hoid, pg_shard_t location) {
      missing_loc[hoid].erase(location);
    }
    void add_active_missing(const pg_missing_t &missing) {
      for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
           i != missing.get_items().end();
           ++i) {
        map<hobject_t, pg_missing_item>::const_iterator j =
          needs_recovery_map.find(i->first);
        if (j == needs_recovery_map.end()) {
          needs_recovery_map.insert(*i);
        } else {
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
          assert(i->second.need == j->second.need);
        }
      }
    }

    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
      needs_recovery_map[hoid] = pg_missing_item(need, have);
    }
    void revise_need(const hobject_t &hoid, eversion_t need) {
      assert(needs_recovery(hoid));
      needs_recovery_map[hoid].need = need;
    }

    /// Adds info about a possible recovery source
    bool add_source_info(
      pg_shard_t source,            ///< [in] source
      const pg_info_t &oinfo,       ///< [in] info
      const pg_missing_t &omissing, ///< [in] (optional) missing
      ThreadPool::TPHandle* handle  ///< [in] ThreadPool handle
      ); ///< @return whether a new object location was discovered

    /// Adds recovery sources in batch
    void add_batch_sources_info(
      const set<pg_shard_t> &sources, ///< [in] a set of sources which can be used for all objects
      ThreadPool::TPHandle* handle    ///< [in] ThreadPool handle
      );

    /// Uses osdmap to update structures for now down sources
    void check_recovery_sources(const OSDMapRef& osdmap);

    /// Call when hoid is no longer missing in acting set
    void recovered(const hobject_t &hoid) {
      needs_recovery_map.erase(hoid);
      missing_loc.erase(hoid);
    }

    /// Call to update structures for hoid after a change
    void rebuild(
      const hobject_t &hoid,
      pg_shard_t self,
      const set<pg_shard_t> to_recover,
      const pg_info_t &info,
      const pg_missing_t &missing,
      const map<pg_shard_t, pg_missing_t> &pmissing,
      const map<pg_shard_t, pg_info_t> &pinfo) {
      recovered(hoid);
      boost::optional<pg_missing_item> item;
      auto miter = missing.get_items().find(hoid);
      if (miter != missing.get_items().end()) {
        item = miter->second;
      } else {
        for (auto &&i: to_recover) {
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
          assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
            break;
          }
        }
      }
      if (!item)
        return; // recovered!

      needs_recovery_map[hoid] = *item;
      if (item->is_delete())
        return;
      auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
      assert(info.last_backfill.is_max());
      assert(info.last_update >= item->need);
      if (!missing.is_missing(hoid))
        mliter->second.insert(self);
      for (auto &&i: pmissing) {
        auto pinfoiter = pinfo.find(i.first);
        assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
          mliter->second.insert(i.first);
      }
    }

    const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
      return missing_loc.count(hoid) ?
        missing_loc.find(hoid)->second : empty_set;
    }
    const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
      return missing_loc;
    }
    const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
      return needs_recovery_map;
    }
  } missing_loc;
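
  /*
   * MissingLoc flow, illustratively: peering records which objects need
   * recovery and where copies are known to exist; recovery then consults
   * and updates the map.
   *
   *   missing_loc.add_missing(hoid, need, have);      // needs recovery
   *   missing_loc.add_location(hoid, pg_shard_t(3));  // osd.3 has a copy
   *   if (missing_loc.is_unfound(hoid)) {
   *     // no recoverable location known yet; keep querying peers
   *   }
   *   missing_loc.recovered(hoid);  // done; drop it from both maps
   */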

  PastIntervals past_intervals;

  interval_set<snapid_t> snap_trimq;

  /* You should not use these items without taking their respective queue locks
   * (if they have one) */
  xlist<PG*>::item stat_queue_item;
  bool scrub_queued;
  bool recovery_queued;

  int recovery_ops_active;
  set<pg_shard_t> waiting_on_backfill;
#ifdef DEBUG_RECOVERY_OIDS
  set<hobject_t> recovering_oids;
#endif

protected:
  int role;       // 0 = primary, 1 = replica, -1 = none.
  unsigned state; // PG_STATE_*

  bool send_notify; ///< true if we are non-primary and should notify the primary

public:
  eversion_t last_update_ondisk;   // last_update that has committed; ONLY DEFINED WHEN is_active()
  eversion_t last_complete_ondisk; // last_complete that has committed.
  eversion_t last_update_applied;


  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
    PGRef pg;
    epoch_t e;
    eversion_t v;
    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
      : pg(pg), e(e), v(v) {}
    void finish(int) override {
      pg->lock();
      if (!pg->pg_has_reset_since(e)) {
        pg->last_rollback_info_trimmed_to_applied = v;
      }
      pg->unlock();
    }
  };
  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
  // and the transaction has applied
  eversion_t last_rollback_info_trimmed_to_applied;

  // primary state
 public:
  pg_shard_t primary;
  pg_shard_t pg_whoami;
  pg_shard_t up_primary;
  vector<int> up, acting, want_acting;
  set<pg_shard_t> actingbackfill, actingset, upset;
  map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
  eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
  eversion_t pg_trim_to;

  set<int> blocked_by; ///< osds we are blocked by (for pg stats)

  // [primary only] content recovery state

public:
  struct BufferedRecoveryMessages {
    map<int, map<spg_t, pg_query_t> > query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
  };

public:
  bool dne() { return info.dne(); }
  struct RecoveryCtx {
    utime_t start_time;
    map<int, map<spg_t, pg_query_t> > *query_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
    map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
    set<PGRef> created_pgs;
    C_Contexts *on_applied;
    C_Contexts *on_safe;
    ObjectStore::Transaction *transaction;
    ThreadPool::TPHandle* handle;
    RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
                C_Contexts *on_applied,
                C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
      : query_map(query_map), info_map(info_map),
        notify_list(notify_list),
        on_applied(on_applied),
        on_safe(on_safe),
        transaction(transaction),
        handle(NULL) {}

    RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
      : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
        on_applied(rctx.on_applied),
        on_safe(rctx.on_safe),
        transaction(rctx.transaction),
        handle(rctx.handle) {}

    void accept_buffered_messages(BufferedRecoveryMessages &m) {
      assert(query_map);
      assert(info_map);
      assert(notify_list);
      for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
        map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
        for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
             j != i->second.end();
             ++j) {
          omap[j->first] = j->second;
        }
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.info_map.begin();
           i != m.info_map.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*info_map)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
      for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
             = m.notify_list.begin();
           i != m.notify_list.end();
           ++i) {
        vector<pair<pg_notify_t, PastIntervals> > &ovec =
          (*notify_list)[i->first];
        ovec.reserve(ovec.size() + i->second.size());
        ovec.insert(ovec.end(), i->second.begin(), i->second.end());
      }
    }
  };
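
  /*
   * Illustrative: while outgoing peering traffic must be held back, a
   * nested RecoveryCtx built from a BufferedRecoveryMessages captures
   * queries/infos/notifies, which are merged back later.
   *
   *   BufferedRecoveryMessages buf;
   *   RecoveryCtx buffered(buf, rctx);     // writes land in buf
   *   // ... handle peering events against `buffered` ...
   *   rctx.accept_buffered_messages(buf);  // merge into the real ctx
   */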


  PGStateHistory pgstate_history;

  struct NamedState {
    const char *state_name;
    utime_t enter_time;
    PG* pg;
    const char *get_state_name() { return state_name; }
    NamedState(PG *pg_, const char *state_name_)
      : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
      pg->pgstate_history.enter(pg, enter_time, state_name);
    }
    virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
  };
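
  /*
   * NamedState is an RAII bracket: constructing a state records the
   * enter time in pgstate_history, destroying it records the exit.
   * Sketch of how the recovery states below use it:
   *
   *   struct SomeState : ..., NamedState {
   *     explicit SomeState(PG *pg)
   *       : NamedState(pg, "SomeState") {}  // logs state entry
   *     // ~NamedState() logs the exit when the state is left
   *   };
   */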



protected:

  /*
   * peer_info    -- projected (updates _before_ replicas ack)
   * peer_missing -- committed (updates _after_ replicas ack)
   */

  bool need_up_thru;
  set<pg_shard_t> stray_set;  // non-acting osds that have PG data.
  eversion_t oldest_update;   // acting: lowest (valid) last_update in active set
  map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
  set<pg_shard_t> peer_purged;          // peers purged
  map<pg_shard_t, pg_missing_t> peer_missing;
  set<pg_shard_t> peer_log_requested;   // logs I've requested (and start stamps)
  set<pg_shard_t> peer_missing_requested;

  // I deleted these strays; ignore racing PGInfo from them
  set<pg_shard_t> peer_activated;

  // primary-only, recovery-only state
  set<pg_shard_t> might_have_unfound; // These osds might have objects on them
                                      // which are unfound on the primary
  epoch_t last_peering_reset;


  /* heartbeat peers */
  void set_probe_targets(const set<pg_shard_t> &probe_set);
  void clear_probe_targets();
public:
  Mutex heartbeat_peer_lock;
  set<int> heartbeat_peers;
  set<int> probe_targets;

  /**
   * BackfillInterval
   *
   * Represents the objects in a range [begin, end)
   *
   * Possible states:
   * 1) begin == end == hobject_t() indicates that the interval is unpopulated
   * 2) Else, objects contains all objects in [begin, end)
   */
  struct BackfillInterval {
    // info about a backfill interval on a peer
    eversion_t version; /// version at which the scan occurred
    map<hobject_t,eversion_t> objects;
    hobject_t begin;
    hobject_t end;

    /// clear content
    void clear() {
      *this = BackfillInterval();
    }

    /// clear objects list only
    void clear_objects() {
      objects.clear();
    }

    /// reinstantiate with a new start+end position and sort order
    void reset(hobject_t start) {
      clear();
      begin = end = start;
    }

    /// true if there are no objects in this interval
    bool empty() const {
      return objects.empty();
    }

    /// true if interval extends to the end of the range
    bool extends_to_end() const {
      return end.is_max();
    }

    /// removes items <= soid and adjusts begin to the first object
    void trim_to(const hobject_t &soid) {
      trim();
      while (!objects.empty() &&
             objects.begin()->first <= soid) {
        pop_front();
      }
    }

    /// Adjusts begin to the first object
    void trim() {
      if (!objects.empty())
        begin = objects.begin()->first;
      else
        begin = end;
    }

    /// drop first entry, and adjust @begin accordingly
    void pop_front() {
      assert(!objects.empty());
      objects.erase(objects.begin());
      trim();
    }

    /// dump
    void dump(Formatter *f) const {
      f->dump_stream("begin") << begin;
      f->dump_stream("end") << end;
      f->open_array_section("objects");
      for (map<hobject_t, eversion_t>::const_iterator i =
             objects.begin();
           i != objects.end();
           ++i) {
        f->open_object_section("object");
        f->dump_stream("object") << i->first;
        f->dump_stream("version") << i->second;
        f->close_section();
      }
      f->close_section();
    }
  };
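
  /*
   * Illustrative BackfillInterval lifecycle: reposition, scan, consume.
   *
   *   BackfillInterval bi;
   *   bi.reset(scan_start);          // begin == end == scan_start
   *   // ... a scan fills bi.objects and advances bi.end ...
   *   bi.trim_to(last_backfilled);   // drop entries <= that object
   *   if (bi.empty() && bi.extends_to_end())
   *     ;  // nothing further to backfill from this interval
   */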

protected:
  BackfillInterval backfill_info;
  map<pg_shard_t, BackfillInterval> peer_backfill_info;
  bool backfill_reserved;
  bool backfill_reserving;

  friend class OSD;

public:
  set<pg_shard_t> backfill_targets;

  bool is_backfill_targets(pg_shard_t osd) {
    return backfill_targets.count(osd);
  }

protected:

  /*
   * blocked request wait hierarchy
   *
   * In order to preserve request ordering we need to be careful about the
   * order in which blocked requests get requeued.  Generally speaking, we
   * push the requests back up to the op_wq in reverse order (most recent
   * request first) so that they come back out again in the original order.
   * However, because there are multiple wait queues, we need to requeue
   * waitlists in order.  Generally speaking, we requeue the wait lists
   * that are checked first.
   *
   * Here are the various wait lists, in the order they are used during
   * request processing, with notes:
   *
   *  - waiting_for_map
   *    - may start or stop blocking at any time (depending on client epoch)
   *  - waiting_for_peered
   *    - !is_peered() or flushes_in_progress
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_active
   *    - !is_active()
   *    - only starts blocking on interval change; never restarts
   *  - waiting_for_flush
   *    - is_active() and flushes_in_progress
   *    - waiting for final flush during activate
   *  - waiting_for_scrub
   *    - starts and stops blocking for varying intervals during scrub
   *  - waiting_for_unreadable_object
   *    - never restarts once object is readable (* except for EIO?)
   *  - waiting_for_degraded_object
   *    - never restarts once object is writeable (* except for EIO?)
   *  - waiting_for_blocked_object
   *    - starts and stops based on proxied op activity
   *  - obc rwlocks
   *    - starts and stops based on read/write activity
   *
   * Notes:
   *
   *  1. During an interval change, we requeue *everything* in the above order.
   *
   *  2. When an obc rwlock is released, we check for a scrub block and requeue
   *     the op there if it applies.  We ignore the unreadable/degraded/blocked
   *     queues because we assume they cannot apply at that time (this is
   *     probably mostly true).
   *
   *  3. The requeue_ops helper will push ops onto the waiting_for_map list if
   *     it is non-empty.
   *
   * These three behaviors are generally sufficient to maintain ordering, with
   * the possible exception of cases where we make an object degraded or
   * unreadable that was previously okay, e.g. when scrub or op processing
   * encounter an unexpected error.  FIXME.
   */

  // pg waiters
  unsigned flushes_in_progress;

  // ops with newer maps than ours (or blocked behind them)
  // track these by client, since inter-request ordering doesn't otherwise
  // matter.
  unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;

  // ops waiting on peered
  list<OpRequestRef>            waiting_for_peered;

  // ops waiting on active (require peered as well)
  list<OpRequestRef>            waiting_for_active;
  list<OpRequestRef>            waiting_for_flush;
  list<OpRequestRef>            waiting_for_scrub;

  list<OpRequestRef>            waiting_for_cache_not_full;
  list<OpRequestRef>            waiting_for_clean_to_primary_repair;
  map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
                                     waiting_for_degraded_object,
                                     waiting_for_blocked_object;

  set<hobject_t> objects_blocked_on_cache_full;
  map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
  map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;

  // Callbacks should assume pg (and nothing else) is locked
  map<hobject_t, list<Context*>> callbacks_for_degraded_object;

  map<eversion_t,
      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;

  void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
  void requeue_op(OpRequestRef op);
  void requeue_ops(list<OpRequestRef> &l);

  // stats that persist lazily
  object_stat_collection_t unstable_stats;

  // publish stats
  Mutex pg_stats_publish_lock;
  bool pg_stats_publish_valid;
  pg_stat_t pg_stats_publish;

  // for ordering writes
  ceph::shared_ptr<ObjectStore::Sequencer> osr;

  void _update_calc_stats();
  void _update_blocked_by();
  void publish_stats_to_osd();
  void clear_publish_stats();

public:
  void clear_primary_state();

  bool is_actingbackfill(pg_shard_t osd) const {
    return actingbackfill.count(osd);
  }
  bool is_acting(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), acting, osd);
  }
  bool is_up(pg_shard_t osd) const {
    return has_shard(pool.info.ec_pool(), up, osd);
  }
  static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
    if (ec) {
      return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
    } else {
      return std::find(v.begin(), v.end(), osd.osd) != v.end();
    }
  }
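
  /*
   * has_shard(), worked through (hypothetical vector): for EC pools the
   * shard index must match the position in the vector; for replicated
   * pools membership alone suffices.
   *
   *   vector<int> v = {4, 7, 2};
   *   has_shard(true,  v, pg_shard_t(7, shard_id_t(1)));  // true: v[1] == 7
   *   has_shard(true,  v, pg_shard_t(7, shard_id_t(0)));  // false: v[0] == 4
   *   has_shard(false, v, pg_shard_t(7));                 // true: 7 is in v
   */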

  bool needs_recovery() const;
  bool needs_backfill() const;

  /// clip calculated priority to reasonable range
  inline int clamp_recovery_priority(int priority);
  /// get log recovery reservation priority
  unsigned get_recovery_priority();
  /// get backfill reservation priority
  unsigned get_backfill_priority();

  void mark_clean();  ///< mark an active pg clean
  void _change_recovery_force_mode(int new_mode, bool clear);

  /// return [start,end) bounds for required past_intervals
  static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
    const pg_info_t &info,
    epoch_t oldest_map) {
    epoch_t start = MAX(
      info.history.last_epoch_clean ? info.history.last_epoch_clean :
        info.history.epoch_pool_created,
      oldest_map);
    epoch_t end = MAX(
      info.history.same_interval_since,
      info.history.epoch_pool_created);
    return make_pair(start, end);
  }
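
  /*
   * Worked example (hypothetical epochs): last_epoch_clean = 880,
   * epoch_pool_created = 500, same_interval_since = 940, oldest_map = 900:
   *   start = MAX(880, 900) = 900   // cannot look before oldest_map
   *   end   = MAX(940, 500) = 940
   * so past intervals covering [900, 940) are required.
   */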
  void check_past_interval_bounds() const;
  PastIntervals::PriorSet build_prior();

  void remove_down_peer_info(const OSDMapRef osdmap);

  bool adjust_need_up_thru(const OSDMapRef osdmap);

  bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
  virtual void dump_recovery_info(Formatter *f) const = 0;

  bool calc_min_last_complete_ondisk() {
    eversion_t min = last_complete_ondisk;
    assert(!actingbackfill.empty());
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
         i != actingbackfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      if (peer_last_complete_ondisk.count(*i) == 0)
        return false;   // we don't have complete info
      eversion_t a = peer_last_complete_ondisk[*i];
      if (a < min)
        min = a;
    }
    if (min == min_last_complete_ondisk)
      return false;
    min_last_complete_ondisk = min;
    return true;
  }

  virtual void calc_trim_to() = 0;

  void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
                        pg_missing_t& omissing, pg_shard_t from);
  void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
                       pg_missing_t& omissing, pg_shard_t from);
  bool proc_replica_info(
    pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);

  struct PGLogEntryHandler : public PGLog::LogEntryHandler {
    PG *pg;
    ObjectStore::Transaction *t;
    PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}

    // LogEntryHandler
    void remove(const hobject_t &hoid) override {
      pg->get_pgbackend()->remove(hoid, t);
    }
    void try_stash(const hobject_t &hoid, version_t v) override {
      pg->get_pgbackend()->try_stash(hoid, v, t);
    }
    void rollback(const pg_log_entry_t &entry) override {
      assert(entry.can_rollback());
      pg->get_pgbackend()->rollback(entry, t);
    }
    void rollforward(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->rollforward(entry, t);
    }
    void trim(const pg_log_entry_t &entry) override {
      pg->get_pgbackend()->trim(entry, t);
    }
  };

  void update_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid,
    const set<snapid_t> &snaps);
  void clear_object_snap_mapping(
    ObjectStore::Transaction *t, const hobject_t &soid);
  void remove_snap_mapped_object(
    ObjectStore::Transaction& t, const hobject_t& soid);
  void merge_log(
    ObjectStore::Transaction& t, pg_info_t &oinfo,
    pg_log_t &olog, pg_shard_t from);
  void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
  bool search_for_missing(
    const pg_info_t &oinfo, const pg_missing_t &omissing,
    pg_shard_t fromosd,
    RecoveryCtx*);

  void check_for_lost_objects();
  void forget_lost_objects();

  void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);

  void trim_write_ahead();

  map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
    const map<pg_shard_t, pg_info_t> &infos,
    bool restrict_to_up_acting,
    bool *history_les_bound) const;
  static void calc_ec_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  static void calc_replicated_acting(
    map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
    unsigned size,
    const vector<int> &acting,
    pg_shard_t acting_primary,
    const vector<int> &up,
    pg_shard_t up_primary,
    const map<pg_shard_t, pg_info_t> &all_info,
    bool restrict_to_up_acting,
    vector<int> *want,
    set<pg_shard_t> *backfill,
    set<pg_shard_t> *acting_backfill,
    pg_shard_t *want_primary,
    ostream &ss);
  bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
  void build_might_have_unfound();
  void activate(
    ObjectStore::Transaction& t,
    epoch_t activation_epoch,
    list<Context*>& tfin,
    map<int, map<spg_t,pg_query_t> >& query_map,
    map<int,
      vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
    RecoveryCtx *ctx);
  void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
  void all_activated_and_committed();

  void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);

  bool have_unfound() const {
    return missing_loc.have_unfound();
  }
  uint64_t get_num_unfound() const {
    return missing_loc.num_unfound();
  }

  virtual void check_local() = 0;

  /**
   * @param ops_begun returns how many recovery ops the function started
   * @returns true if any useful work was accomplished; false otherwise
   */
  virtual bool start_recovery_ops(
    uint64_t max,
    ThreadPool::TPHandle &handle,
    uint64_t *ops_begun) = 0;

  void purge_strays();

  void update_heartbeat_peers();

  Context *finish_sync_event;

  void finish_recovery(list<Context*>& tfin);
  void _finish_recovery(Context *c);
  void cancel_recovery();
  void clear_recovery_state();
  virtual void _clear_recovery_state() = 0;
  virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
  void start_recovery_op(const hobject_t& soid);
  void finish_recovery_op(const hobject_t& soid, bool dequeue=false);

  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
  virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;

  friend class C_OSD_RepModify_Commit;

  // -- backoff --
  Mutex backoff_lock;  // orders inside Backoff::lock
  map<hobject_t,set<BackoffRef>> backoffs;

  void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& begin, const hobject_t& end);
  void release_backoffs(const hobject_t& o) {
    release_backoffs(o, o);
  }
  void clear_backoffs();

  void add_pg_backoff(SessionRef s) {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    add_backoff(s, begin, end);
  }
  void release_pg_backoffs() {
    hobject_t begin = info.pgid.pgid.get_hobj_start();
    hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
    release_backoffs(begin, end);
  }

  void rm_backoff(BackoffRef b);

  // -- scrub --
  struct Scrubber {
    Scrubber();
    ~Scrubber();

    // metadata
    set<pg_shard_t> reserved_peers;
    bool reserved, reserve_failed;
    epoch_t epoch_start;

    // common to both scrubs
    bool active;
    set<pg_shard_t> waiting_on_whom;
    int shallow_errors;
    int deep_errors;
    int large_omap_objects = 0;
    int fixed;
    ScrubMap primary_scrubmap;
    ScrubMapBuilder primary_scrubmap_pos;
    epoch_t replica_scrub_start = 0;
    ScrubMap replica_scrubmap;
    ScrubMapBuilder replica_scrubmap_pos;
    map<pg_shard_t, ScrubMap> received_maps;
    OpRequestRef active_rep_scrub;
    utime_t scrub_reg_stamp;  // stamp we registered for

    // For async sleep
    bool sleeping = false;
    bool needs_sleep = true;
    utime_t sleep_start;

    // flags to indicate explicitly requested scrubs (by admin)
    bool must_scrub, must_deep_scrub, must_repair;

    // Priority to use for scrub scheduling
    unsigned priority;

    // this flag indicates whether we would like to auto-repair the PG
    bool auto_repair;

    // Maps from objects with errors to missing/inconsistent peers
    map<hobject_t, set<pg_shard_t>> missing;
    map<hobject_t, set<pg_shard_t>> inconsistent;

    // Map from object with errors to good peers
    map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;

    // Cleaned map pending snap metadata scrub
    ScrubMap cleaned_meta_map;

    void clean_meta_map(ScrubMap &for_meta_scrub) {
      if (end.is_max() ||
          cleaned_meta_map.objects.empty()) {
        cleaned_meta_map.swap(for_meta_scrub);
      } else {
        auto iter = cleaned_meta_map.objects.end();
        --iter; // not empty, see the if clause above
        auto begin = cleaned_meta_map.objects.begin();
        if (iter->first.has_snapset()) {
          ++iter;
        } else {
          while (iter != begin) {
            auto next = iter--;
            if (next->first.get_head() != iter->first.get_head()) {
              ++iter;
              break;
            }
          }
        }
        for_meta_scrub.objects.insert(begin, iter);
        cleaned_meta_map.objects.erase(begin, iter);
      }
    }
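
    /*
     * Illustrative: only complete clone groups (entries sharing a head)
     * are released for metadata scrub; a possibly-incomplete trailing
     * group is held back until the next chunk extends it.
     *
     *   cleaned_meta_map.objects = { A:4, A:head, B:2, B:7 }, end < max
     *   // B:7 is not a head, so the trailing B group may still grow;
     *   // clean_meta_map() moves A:4 and A:head into for_meta_scrub
     *   // and keeps B:2 and B:7 in cleaned_meta_map.
     */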

    // digest updates which we are waiting on
    int num_digest_updates_pending;

    // chunky scrub
    hobject_t start, end;  // [start,end)
    hobject_t max_end;     // Largest end that may have been sent to replicas
    eversion_t subset_last_update;

    // chunky scrub state
    enum State {
      INACTIVE,
      NEW_CHUNK,
      WAIT_PUSHES,
      WAIT_LAST_UPDATE,
      BUILD_MAP,
      BUILD_MAP_DONE,
      WAIT_REPLICAS,
      COMPARE_MAPS,
      WAIT_DIGEST_UPDATES,
      FINISH,
      BUILD_MAP_REPLICA,
    } state;

    std::unique_ptr<Scrub::Store> store;
    // deep scrub
    bool deep;
    int preempt_left;
    int preempt_divisor;

    list<Context*> callbacks;
    void add_callback(Context *context) {
      callbacks.push_back(context);
    }
    void run_callbacks() {
      list<Context*> to_run;
      to_run.swap(callbacks);
      for (list<Context*>::iterator i = to_run.begin();
           i != to_run.end();
           ++i) {
        (*i)->complete(0);
      }
    }

    static const char *state_string(const PG::Scrubber::State& state) {
      const char *ret = NULL;
      switch( state )
      {
        case INACTIVE: ret = "INACTIVE"; break;
        case NEW_CHUNK: ret = "NEW_CHUNK"; break;
        case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
        case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
        case BUILD_MAP: ret = "BUILD_MAP"; break;
        case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
        case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
        case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
        case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
        case FINISH: ret = "FINISH"; break;
        case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
      }
      return ret;
    }

    bool is_chunky_scrub_active() const { return state != INACTIVE; }

    // clear all state
    void reset() {
      active = false;
      waiting_on_whom.clear();
      if (active_rep_scrub) {
        active_rep_scrub = OpRequestRef();
      }
      received_maps.clear();

      must_scrub = false;
      must_deep_scrub = false;
      must_repair = false;
      auto_repair = false;

      state = PG::Scrubber::INACTIVE;
      start = hobject_t();
      end = hobject_t();
      max_end = hobject_t();
      subset_last_update = eversion_t();
      shallow_errors = 0;
      deep_errors = 0;
      large_omap_objects = 0;
      fixed = 0;
      deep = false;
      run_callbacks();
      inconsistent.clear();
      missing.clear();
      authoritative.clear();
      num_digest_updates_pending = 0;
      primary_scrubmap = ScrubMap();
      primary_scrubmap_pos.reset();
      replica_scrubmap = ScrubMap();
      replica_scrubmap_pos.reset();
      cleaned_meta_map = ScrubMap();
      sleeping = false;
      needs_sleep = true;
      sleep_start = utime_t();
    }

    void create_results(const hobject_t& obj);
    void cleanup_store(ObjectStore::Transaction *t);
  } scrubber;

  bool scrub_after_recovery;

  int active_pushes;

  bool scrub_can_preempt = false;
  bool scrub_preempted = false;

  // We allow some number of preemptions of the scrub, which means we do
  // not block.  Then we start to block.  Once we start blocking, we do
  // not stop until the scrub range is completed.
  bool write_blocked_by_scrub(const hobject_t &soid);

  /// true if the given range intersects the scrub interval in any way
  bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);

  void repair_object(
    const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
    pg_shard_t bad_peer);

  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
  void chunky_scrub(ThreadPool::TPHandle &handle);
  void scrub_compare_maps();
  /**
   * return true if any inconsistency/missing is repaired, false otherwise
   */
  bool scrub_process_inconsistent();
  bool ops_blocked_by_scrub() const;
  void scrub_finish();
  void scrub_clear_state();
  void _scan_snaps(ScrubMap &map);
  void _repair_oinfo_oid(ScrubMap &map);
  void _scan_rollback_obs(
    const vector<ghobject_t> &rollback_obs,
    ThreadPool::TPHandle &handle);
  void _request_scrub_map(pg_shard_t replica, eversion_t version,
                          hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
  int build_scrub_map_chunk(
    ScrubMap &map,
    ScrubMapBuilder &pos,
    hobject_t start, hobject_t end, bool deep,
    ThreadPool::TPHandle &handle);
  /**
   * returns true if [begin, end) is good to scrub at this time
   * a false return value obliges the implementer to requeue scrub when the
   * condition preventing scrub clears
   */
  virtual bool _range_available_for_scrub(
    const hobject_t &begin, const hobject_t &end) = 0;
  virtual void scrub_snapshot_metadata(
    ScrubMap &map,
    const std::map<hobject_t,
                   pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest) { }
  virtual void _scrub_clear_state() { }
  virtual void _scrub_finish() { }
  virtual void split_colls(
    spg_t child,
    int split_bits,
    int seed,
    const pg_pool_t *pool,
    ObjectStore::Transaction *t) = 0;
  void clear_scrub_reserved();
  void scrub_reserve_replicas();
  void scrub_unreserve_replicas();
  bool scrub_all_replicas_reserved() const;
  bool sched_scrub();
  void reg_next_scrub();
  void unreg_next_scrub();

  void replica_scrub(
    OpRequestRef op,
    ThreadPool::TPHandle &handle);
  void do_replica_scrub_map(OpRequestRef op);
  void sub_op_scrub_map(OpRequestRef op);

  void handle_scrub_reserve_request(OpRequestRef op);
  void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
  void handle_scrub_reserve_release(OpRequestRef op);

  void reject_reservation();
  void schedule_backfill_retry(float retry);
  void schedule_recovery_retry(float retry);

  // -- recovery state --

  template <class EVT>
  struct QueuePeeringEvt : Context {
    PGRef pg;
    epoch_t epoch;
    EVT evt;
    QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
      pg(pg), epoch(epoch), evt(evt) {}
    void finish(int r) override {
      pg->lock();
      pg->queue_peering_event(PG::CephPeeringEvtRef(
                                new PG::CephPeeringEvt(
                                  epoch,
                                  epoch,
                                  evt)));
      pg->unlock();
    }
  };
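
  /*
   * Illustrative sketch: a completion callback re-injects a peering
   * event, stamped with the epoch at which it was scheduled so a stale
   * completion can be recognized after an interval change.
   *
   *   Context *c = new QueuePeeringEvt<IntervalFlush>(
   *     this, get_osdmap()->get_epoch(), IntervalFlush());
   */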

  class CephPeeringEvt {
    epoch_t epoch_sent;
    epoch_t epoch_requested;
    boost::intrusive_ptr< const boost::statechart::event_base > evt;
    string desc;
  public:
    MEMPOOL_CLASS_HELPERS();
    template <class T>
    CephPeeringEvt(epoch_t epoch_sent,
                   epoch_t epoch_requested,
                   const T &evt_) :
      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
      evt(evt_.intrusive_from_this()) {
      stringstream out;
      out << "epoch_sent: " << epoch_sent
          << " epoch_requested: " << epoch_requested << " ";
      evt_.print(&out);
      desc = out.str();
    }
    epoch_t get_epoch_sent() { return epoch_sent; }
    epoch_t get_epoch_requested() { return epoch_requested; }
    const boost::statechart::event_base &get_event() { return *evt; }
    string get_desc() { return desc; }
  };
  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
  list<CephPeeringEvtRef> peering_queue;  // op queue
  list<CephPeeringEvtRef> peering_waiters;

  struct QueryState : boost::statechart::event< QueryState > {
    Formatter *f;
    explicit QueryState(Formatter *f) : f(f) {}
    void print(std::ostream *out) const {
      *out << "Query";
    }
  };

  struct MInfoRec : boost::statechart::event< MInfoRec > {
    pg_shard_t from;
    pg_info_t info;
    epoch_t msg_epoch;
    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
      from(from), info(info), msg_epoch(msg_epoch) {}
    void print(std::ostream *out) const {
      *out << "MInfoRec from " << from << " info: " << info;
    }
  };

  struct MLogRec : boost::statechart::event< MLogRec > {
    pg_shard_t from;
    boost::intrusive_ptr<MOSDPGLog> msg;
    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
      from(from), msg(msg) {}
    void print(std::ostream *out) const {
      *out << "MLogRec from " << from;
    }
  };

  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
    pg_shard_t from;
    pg_notify_t notify;
    uint64_t features;
    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
      from(from), notify(notify), features(f) {}
    void print(std::ostream *out) const {
      *out << "MNotifyRec from " << from << " notify: " << notify
           << " features: 0x" << hex << features << dec;
    }
  };

  struct MQuery : boost::statechart::event< MQuery > {
    pg_shard_t from;
    pg_query_t query;
    epoch_t query_epoch;
    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
      from(from), query(query), query_epoch(query_epoch) {}
    void print(std::ostream *out) const {
      *out << "MQuery from " << from
           << " query_epoch " << query_epoch
           << " query: " << query;
    }
  };

  struct AdvMap : boost::statechart::event< AdvMap > {
    OSDMapRef osdmap;
    OSDMapRef lastmap;
    vector<int> newup, newacting;
    int up_primary, acting_primary;
    AdvMap(
      OSDMapRef osdmap, OSDMapRef lastmap,
      vector<int>& newup, int up_primary,
      vector<int>& newacting, int acting_primary):
      osdmap(osdmap), lastmap(lastmap),
      newup(newup),
      newacting(newacting),
      up_primary(up_primary),
      acting_primary(acting_primary) {}
    void print(std::ostream *out) const {
      *out << "AdvMap";
    }
  };

  struct ActMap : boost::statechart::event< ActMap > {
    ActMap() : boost::statechart::event< ActMap >() {}
    void print(std::ostream *out) const {
      *out << "ActMap";
    }
  };
  struct Activate : boost::statechart::event< Activate > {
    epoch_t activation_epoch;
    explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
                                   activation_epoch(q) {}
    void print(std::ostream *out) const {
      *out << "Activate from " << activation_epoch;
    }
  };
  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
    unsigned priority;
    explicit RequestBackfillPrio(unsigned prio) :
      boost::statechart::event< RequestBackfillPrio >(),
      priority(prio) {}
    void print(std::ostream *out) const {
      *out << "RequestBackfillPrio: priority " << priority;
    }
  };
#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
    T() : boost::statechart::event< T >() {}                       \
    void print(std::ostream *out) const {                          \
      *out << #T;                                                  \
    }                                                              \
  };
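
  /*
   * For reference, TrivialEvent(Backfilled) expands to:
   *
   *   struct Backfilled : boost::statechart::event< Backfilled > {
   *     Backfilled() : boost::statechart::event< Backfilled >() {}
   *     void print(std::ostream *out) const {
   *       *out << "Backfilled";
   *     }
   *   };
   */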
3efd9988
FG
1618 struct DeferBackfill : boost::statechart::event<DeferBackfill> {
1619 float delay;
1620 explicit DeferBackfill(float delay) : delay(delay) {}
1621 void print(std::ostream *out) const {
1622 *out << "DeferBackfill: delay " << delay;
1623 }
1624 };
1625 struct DeferRecovery : boost::statechart::event<DeferRecovery> {
1626 float delay;
1627 explicit DeferRecovery(float delay) : delay(delay) {}
1628 void print(std::ostream *out) const {
1629 *out << "DeferRecovery: delay " << delay;
1630 }
1631 };
b32b8144
FG
1632 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
1633 explicit UnfoundBackfill() {}
1634 void print(std::ostream *out) const {
1635 *out << "UnfoundBackfill";
1636 }
1637 };
1638 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
1639 explicit UnfoundRecovery() {}
1640 void print(std::ostream *out) const {
1641 *out << "UnfoundRecovery";
1642 }
1643 };
1644protected:
7c673cae
FG
1645 TrivialEvent(Initialize)
1646 TrivialEvent(Load)
1647 TrivialEvent(GotInfo)
1648 TrivialEvent(NeedUpThru)
1649 TrivialEvent(NullEvt)
1650 TrivialEvent(FlushedEvt)
1651 TrivialEvent(Backfilled)
1652 TrivialEvent(LocalBackfillReserved)
1653 TrivialEvent(RemoteBackfillReserved)
3efd9988 1654 TrivialEvent(RejectRemoteReservation)
7c673cae 1655 TrivialEvent(RemoteReservationRejected)
3efd9988 1656 TrivialEvent(RemoteReservationCanceled)
7c673cae
FG
1657 TrivialEvent(RequestBackfill)
1658 TrivialEvent(RequestRecovery)
1659 TrivialEvent(RecoveryDone)
1660 TrivialEvent(BackfillTooFull)
1661 TrivialEvent(RecoveryTooFull)
3efd9988
FG
1662
1663 TrivialEvent(MakePrimary)
1664 TrivialEvent(MakeStray)
1665 TrivialEvent(NeedActingChange)
1666 TrivialEvent(IsIncomplete)
1667 TrivialEvent(IsDown)
7c673cae
FG
1668
1669 TrivialEvent(AllReplicasRecovered)
1670 TrivialEvent(DoRecovery)
1671 TrivialEvent(LocalRecoveryReserved)
1672 TrivialEvent(RemoteRecoveryReserved)
1673 TrivialEvent(AllRemotesReserved)
1674 TrivialEvent(AllBackfillsReserved)
1675 TrivialEvent(GoClean)
1676
1677 TrivialEvent(AllReplicasActivated)
1678
1679 TrivialEvent(IntervalFlush)
1680
1681 /* Encapsulates PG recovery process */
1682 class RecoveryState {
1683 void start_handle(RecoveryCtx *new_ctx);
1684 void end_handle();
1685 public:
1686 void begin_block_outgoing();
1687 void end_block_outgoing();
1688 void clear_blocked_outgoing();
1689 private:
1690
1691 /* States */
1692 struct Initial;
1693 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1694 RecoveryState *state;
1695 public:
1696 PG *pg;
1697
1698 utime_t event_time;
1699 uint64_t event_count;
1700
1701 void clear_event_counters() {
1702 event_time = utime_t();
1703 event_count = 0;
1704 }
1705
1706 void log_enter(const char *state_name);
1707 void log_exit(const char *state_name, utime_t duration);
1708
1709 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1710
1711 /* Accessor functions for state methods */
1712 ObjectStore::Transaction* get_cur_transaction() {
1713 assert(state->rctx);
1714 assert(state->rctx->transaction);
1715 return state->rctx->transaction;
1716 }
1717
1718 void send_query(pg_shard_t to, const pg_query_t &query) {
1719 assert(state->rctx);
1720 assert(state->rctx->query_map);
1721 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1722 query;
1723 }
1724
1725 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1726 assert(state->rctx);
1727 assert(state->rctx->query_map);
1728 return state->rctx->query_map;
1729 }
1730
1731 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1732 assert(state->rctx);
1733 assert(state->rctx->info_map);
1734 return state->rctx->info_map;
1735 }
1736
1737 list< Context* > *get_on_safe_context_list() {
1738 assert(state->rctx);
1739 assert(state->rctx->on_safe);
1740 return &(state->rctx->on_safe->contexts);
1741 }
1742
1743 list< Context * > *get_on_applied_context_list() {
1744 assert(state->rctx);
1745 assert(state->rctx->on_applied);
1746 return &(state->rctx->on_applied->contexts);
1747 }
1748
1749 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1750
1751 void send_notify(pg_shard_t to,
1752 const pg_notify_t &info, const PastIntervals &pi) {
1753 assert(state->rctx);
1754 assert(state->rctx->notify_list);
1755 (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
1756 }
1757 };
1758 friend class RecoveryMachine;
1759
1760 /* States */
b32b8144
FG
1761 // Initial
1762 // Reset
1763 // Start
1764 // Started
1765 // Primary
1766 // WaitActingChange
1767 // Peering
1768 // GetInfo
1769 // GetLog
1770 // GetMissing
1771 // WaitUpThru
1772 // Incomplete
1773 // Active
1774 // Activating
1775 // Clean
1776 // Recovered
1777 // Backfilling
1778 // WaitRemoteBackfillReserved
1779 // WaitLocalBackfillReserved
1780 // NotBackfilling
1781 // NotRecovering
1782 // Recovering
1783 // WaitRemoteRecoveryReserved
1784 // WaitLocalRecoveryReserved
1785 // ReplicaActive
1786 // RepNotRecovering
1787 // RepRecovering
1788 // RepWaitBackfillReserved
1789 // RepWaitRecoveryReserved
1790 // Stray

    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };

    struct Reset;

    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< Initialize, Reset >,
        boost::statechart::custom_reaction< Load >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
      > reactions;

      boost::statechart::result react(const Load&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Start;

    struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
      explicit Started(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const FlushedEvt&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
      }
    };

    struct Primary;
    struct Stray;

    struct Start : boost::statechart::state< Start, Started >, NamedState {
      explicit Start(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::transition< MakePrimary, Primary >,
        boost::statechart::transition< MakeStray, Stray >
      > reactions;
    };

    struct Peering;
    struct WaitActingChange;
    struct Incomplete;
    struct Down;

    struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
      explicit Primary(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< NeedActingChange, WaitActingChange >
      > reactions;
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MNotifyRec&);
    };

    struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
                              NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >
      > reactions;
      explicit WaitActingChange(my_context ctx);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MNotifyRec&);
      void exit();
    };

    struct GetInfo;
    struct Active;

    struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
      PastIntervals::PriorSet prior_set;
      bool history_les_bound;  ///< need osd_find_best_info_ignore_history_les

      explicit Peering(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::transition< Activate, Active >,
        boost::statechart::custom_reaction< AdvMap >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap &advmap);
    };

    struct WaitLocalRecoveryReserved;
    struct Activating;
    struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
      explicit Active(my_context ctx);
      void exit();

      const set<pg_shard_t> remote_shards_to_reserve_recovery;
      const set<pg_shard_t> remote_shards_to_reserve_backfill;
      bool all_replicas_activated;

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< AllReplicasActivated >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< DoRecovery>
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MNotifyRec& notevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const Backfilled&) {
        return discard_event();
      }
      boost::statechart::result react(const AllReplicasActivated&);
      boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DoRecovery&) {
        return discard_event();
      }
    };

    struct Clean : boost::statechart::state< Clean, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
      > reactions;
      explicit Clean(my_context ctx);
      void exit();
    };

    struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< GoClean, Clean >,
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< AllReplicasActivated >
      > reactions;
      explicit Recovered(my_context ctx);
      void exit();
      boost::statechart::result react(const AllReplicasActivated&) {
        post_event(GoClean());
        return forward_event();
      }
    };

    struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< Backfilled, Recovered >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
        boost::statechart::custom_reaction< RemoteReservationRejected >
      > reactions;
      explicit Backfilling(my_context ctx);
      boost::statechart::result react(const RemoteReservationRejected& evt);
      boost::statechart::result react(const DeferBackfill& evt);
      boost::statechart::result react(const UnfoundBackfill& evt);
      void exit();
    };

    struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::transition< AllBackfillsReserved, Backfilling >
      > reactions;
      set<pg_shard_t>::const_iterator backfill_osd_it;
      explicit WaitRemoteBackfillReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejected& evt);
    };

    struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
      > reactions;
      explicit WaitLocalBackfillReserved(my_context ctx);
      void exit();
    };

    struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >
      > reactions;
      explicit NotBackfilling(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejected& evt);
    };

    struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< UnfoundRecovery >
      > reactions;
      explicit NotRecovering(my_context ctx);
      boost::statechart::result react(const DeferRecovery& evt) {
        /* no-op */
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        /* no-op */
        return discard_event();
      }
      void exit();
    };

    struct RepNotRecovering;
    struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
      explicit ReplicaActive(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< Activate >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MQuery&);
      boost::statechart::result react(const Activate&);
      boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
        return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
      }
    };

    struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::transition< RecoveryDone, RepNotRecovering >,
        // for compat with old peers
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::custom_reaction< BackfillTooFull >
      > reactions;
      explicit RepRecovering(my_context ctx);
      boost::statechart::result react(const BackfillTooFull &evt);
      void exit();
    };

    struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationCanceled >
      > reactions;
      explicit RepWaitBackfillReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteBackfillReserved &evt);
      boost::statechart::result react(const RejectRemoteReservation &evt);
      boost::statechart::result react(const RemoteReservationRejected &evt);
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };

    struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        // for compat with old peers
        boost::statechart::custom_reaction< RemoteReservationRejected >,
        boost::statechart::custom_reaction< RemoteReservationCanceled >
      > reactions;
      explicit RepWaitRecoveryReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      boost::statechart::result react(const RemoteReservationRejected &evt) {
        // for compat with old peers
        post_event(RemoteReservationCanceled());
        return discard_event();
      }
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };

    struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
      typedef boost::mpl::list<
        boost::statechart::custom_reaction< RequestBackfillPrio >,
        boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
        boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
      > reactions;
      explicit RepNotRecovering(my_context ctx);
      boost::statechart::result react(const RequestBackfillPrio &evt);
      boost::statechart::result react(const RejectRemoteReservation &evt);
      void exit();
    };

    struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< AllReplicasRecovered >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< RequestBackfill >
      > reactions;
      explicit Recovering(my_context ctx);
      void exit();
      void release_reservations(bool cancel = false);
      boost::statechart::result react(const AllReplicasRecovered &evt);
      boost::statechart::result react(const DeferRecovery& evt);
      boost::statechart::result react(const UnfoundRecovery& evt);
      boost::statechart::result react(const RequestBackfill &evt);
    };

    struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< RemoteRecoveryReserved >,
        boost::statechart::transition< AllRemotesReserved, Recovering >
      > reactions;
      set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
      explicit WaitRemoteRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      void exit();
    };

    struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
        boost::statechart::custom_reaction< RecoveryTooFull >
      > reactions;
      explicit WaitLocalRecoveryReserved(my_context ctx);
      void exit();
      boost::statechart::result react(const RecoveryTooFull &evt);
    };

    struct Activating : boost::statechart::state< Activating, Active >, NamedState {
      typedef boost::mpl::list <
        boost::statechart::transition< AllReplicasRecovered, Recovered >,
        boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
        boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
      > reactions;
      explicit Activating(my_context ctx);
      void exit();
    };

    struct Stray : boost::statechart::state< Stray, Started >, NamedState {
      map<int, pair<pg_query_t, epoch_t> > pending_queries;

      explicit Stray(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< RecoveryDone >
      > reactions;
      boost::statechart::result react(const MQuery& query);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const RecoveryDone&) {
        return discard_event();
      }
    };

    struct GetLog;

    struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
      set<pg_shard_t> peer_info_requested;

      explicit GetInfo(my_context ctx);
      void exit();
      void get_infos();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::transition< GotInfo, GetLog >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::transition< IsDown, Down >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };

    struct GotLog : boost::statechart::event< GotLog > {
      GotLog() : boost::statechart::event< GotLog >() {}
    };

    struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
      pg_shard_t auth_log_shard;
      boost::intrusive_ptr<MOSDPGLog> msg;

      explicit GetLog(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< GotLog >,
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::transition< IsIncomplete, Incomplete >
      > reactions;
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const GotLog&);
    };

    struct WaitUpThru;

    struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
      set<pg_shard_t> peer_missing_requested;

      explicit GetMissing(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::transition< NeedUpThru, WaitUpThru >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
    };

    struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
      explicit WaitUpThru(my_context ctx);
      void exit();

      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MLogRec >
      > reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap& am);
      boost::statechart::result react(const MLogRec& logrec);
    };

    struct Down : boost::statechart::state< Down, Peering>, NamedState {
      explicit Down(my_context ctx);
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >
      > reactions;
      boost::statechart::result react(const QueryState& infoevt);
      void exit();
    };

    struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
      typedef boost::mpl::list <
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< QueryState >
      > reactions;
      explicit Incomplete(my_context ctx);
      boost::statechart::result react(const AdvMap &advmap);
      boost::statechart::result react(const MNotifyRec& infoevt);
      boost::statechart::result react(const QueryState& infoevt);
      void exit();
    };


    RecoveryMachine machine;
    PG *pg;

    /// context passed in by state machine caller
    RecoveryCtx *orig_ctx;

    /// populated if we are buffering messages pending a flush
    boost::optional<BufferedRecoveryMessages> messages_pending_flush;

    /**
     * populated between start_handle() and end_handle(), points into
     * the message lists for messages_pending_flush while blocking messages
     * or into orig_ctx otherwise
     */
    boost::optional<RecoveryCtx> rctx;
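    /*
     * Illustrative sketch of the buffering flow (call sites are in PG.cc):
     *
     *   begin_block_outgoing();  // rctx now points into messages_pending_flush
     *   ... events are handled, outgoing messages are buffered ...
     *   end_block_outgoing();    // buffered messages are folded back into
     *                            // orig_ctx and the buffer is cleared
     */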

  public:
    explicit RecoveryState(PG *pg)
      : machine(this, pg), pg(pg), orig_ctx(0) {
      machine.initiate();
    }

    void handle_event(const boost::statechart::event_base &evt,
                      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt);
      end_handle();
    }

    void handle_event(CephPeeringEvtRef evt,
                      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt->get_event());
      end_handle();
    }
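    /*
     * Illustrative sketch only: the OSD delivers peering events under the
     * PG lock with a caller-assembled RecoveryCtx, roughly:
     *
     *   RecoveryCtx rctx(...);   // transaction, query/info/notify maps, ...
     *   pg->recovery_state.handle_event(ActMap(), &rctx);
     *
     * start_handle()/end_handle() bracket each delivery, so rctx is only
     * valid while an event is being processed.
     */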

  } recovery_state;


public:
  PG(OSDService *o, OSDMapRef curmap,
     const PGPool &pool, spg_t p);
  ~PG() override;

private:
  // Prevent copying
  explicit PG(const PG& rhs);
  PG& operator=(const PG& rhs);
  const spg_t pg_id;
  uint64_t peer_features;
  uint64_t acting_features;
  uint64_t upacting_features;

  epoch_t last_epoch;

public:
  const spg_t& get_pgid() const { return pg_id; }

  void reset_min_peer_features() {
    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  }
  uint64_t get_min_peer_features() const { return peer_features; }
  void apply_peer_features(uint64_t f) { peer_features &= f; }

  uint64_t get_min_acting_features() const { return acting_features; }
  uint64_t get_min_upacting_features() const { return upacting_features; }
  bool perform_deletes_during_peering() const {
    return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
  }
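  // (i.e. true exactly while CEPH_OSDMAP_RECOVERY_DELETES is unset, when
  // deletes must still happen at peering time rather than during recovery)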

  void init_primary_up_acting(
    const vector<int> &newup,
    const vector<int> &newacting,
    int new_up_primary,
    int new_acting_primary) {
    actingset.clear();
    acting = newacting;
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] != CRUSH_ITEM_NONE)
        actingset.insert(
          pg_shard_t(
            acting[i],
            pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    upset.clear();
    up = newup;
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] != CRUSH_ITEM_NONE)
        upset.insert(
          pg_shard_t(
            up[i],
            pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    if (!pool.info.ec_pool()) {
      up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
      primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
      return;
    }
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
        up_primary = pg_shard_t(up[i], shard_id_t(i));
        break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
        primary = pg_shard_t(acting[i], shard_id_t(i));
        break;
      }
    }
    assert(up_primary.osd == new_up_primary);
    assert(primary.osd == new_acting_primary);
  }
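  /*
   * Worked example (hypothetical values): for an EC pool with
   * newup = {3, 1, 2} and new_up_primary = 3, the loops above produce
   * upset = { 3(0), 1(1), 2(2) } and up_primary = pg_shard_t(3, shard_id_t(0)).
   * For a replicated pool every shard is shard_id_t::NO_SHARD.
   */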
  pg_shard_t get_primary() const { return primary; }

  int get_role() const { return role; }
  void set_role(int r) { role = r; }

  bool is_primary() const { return pg_whoami == primary; }
  bool is_replica() const { return role > 0; }

  epoch_t get_last_peering_reset() const { return last_peering_reset; }

  //int get_state() const { return state; }
  bool state_test(int m) const { return (state & m) != 0; }
  void state_set(int m) { state |= m; }
  void state_clear(int m) { state &= ~m; }

  bool is_complete() const { return info.last_complete == info.last_update; }
  bool should_send_notify() const { return send_notify; }

  int get_state() const { return state; }
  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
  bool is_peering() const { return state_test(PG_STATE_PEERING); }
  bool is_down() const { return state_test(PG_STATE_DOWN); }
  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }

  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
  bool is_peered() const {
    return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
  }

  bool is_empty() const { return info.last_update == eversion_t(0,0); }

  void init(
    int role,
    const vector<int>& up,
    int up_primary,
    const vector<int>& acting,
    int acting_primary,
    const pg_history_t& history,
    const PastIntervals& pim,
    bool backfill,
    ObjectStore::Transaction *t);

  // pg on-disk state
  void do_pending_flush();

  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
  static void _init(ObjectStore::Transaction& t,
                    spg_t pgid, const pg_pool_t *pool);

private:
  void prepare_write_info(map<string,bufferlist> *km);

  void update_store_with_options();
  void update_store_on_load();

public:
  static int _prepare_write_info(
    CephContext* cct,
    map<string,bufferlist> *km,
    epoch_t epoch,
    pg_info_t &info,
    pg_info_t &last_written_info,
    PastIntervals &past_intervals,
    bool dirty_big_info,
    bool dirty_epoch,
    bool try_fast_info,
    PerfCounters *logger = nullptr);
  void write_if_dirty(ObjectStore::Transaction& t);

  PGLog::IndexedLog projected_log;
  bool check_in_progress_op(
    const osd_reqid_t &r,
    eversion_t *version,
    version_t *user_version,
    int *return_code) const;
  eversion_t projected_last_update;
  eversion_t get_next_version() const {
    eversion_t at_version(
      get_osdmap()->get_epoch(),
      projected_last_update.version+1);
    assert(at_version > info.last_update);
    assert(at_version > pg_log.get_head());
    assert(at_version > projected_last_update);
    return at_version;
  }
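  /*
   * Worked example (hypothetical values): at osdmap epoch 42 with
   * projected_last_update = 42'17, get_next_version() returns
   * eversion_t(42, 18), i.e. 42'18.
   */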

  void add_log_entry(const pg_log_entry_t& e, bool applied);
  void append_log(
    const vector<pg_log_entry_t>& logv,
    eversion_t trim_to,
    eversion_t roll_forward_to,
    ObjectStore::Transaction &t,
    bool transaction_applied = true);
  bool check_log_for_corruption(ObjectStore *store);
  void trim_log();

  std::string get_corrupt_pg_log_name() const;
  static int read_info(
    ObjectStore *store, spg_t pgid, const coll_t &coll,
    bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
    __u8 &);
  void read_state(ObjectStore *store, bufferlist &bl);
  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
  static int peek_map_epoch(ObjectStore *store, spg_t pgid,
                            epoch_t *pepoch, bufferlist *bl);
  void update_snap_map(
    const vector<pg_log_entry_t> &log_entries,
    ObjectStore::Transaction& t);

  void filter_snapc(vector<snapid_t> &snaps);

  void log_weirdness();

  virtual void kick_snap_trim() = 0;
  virtual void snap_trimmer_scrub_complete() = 0;
  bool requeue_scrub(bool high_priority = false);
  void queue_recovery();
  bool queue_scrub();
  unsigned get_scrub_priority();

  /// share pg info after a pg is active
  void share_pg_info();


  bool append_log_entries_update_missing(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  /**
   * Merge entries updating missing as necessary on all
   * actingbackfill logs and missings (also missing_loc)
   */
  void merge_new_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  void reset_interval_flush();
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t,
                   list<Context *> *on_applied,
                   list<Context *> *on_safe);
  void set_last_peering_reset();
  bool pg_has_reset_since(epoch_t e) {
    assert(is_locked());
    return deleting || e < get_last_peering_reset();
  }

  void update_history(const pg_history_t& history);
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
                    pair<pg_shard_t, pg_info_t> &notify_info);
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);

  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);

  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);

  // OpRequest queueing
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
  bool old_peering_evt(CephPeeringEvtRef evt) {
    return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
  }
  static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
    return e <= cur_epoch;
  }
  bool have_same_or_newer_map(epoch_t e) {
    return e <= get_osdmap()->get_epoch();
  }

  bool op_has_sufficient_caps(OpRequestRef& op);


  // recovery bits
  void take_waiters();
  void queue_peering_event(CephPeeringEvtRef evt);
  void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
  void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
                   pg_shard_t from, const pg_query_t& q);
  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
  void queue_flushed(epoch_t started_at);
  void handle_advance_map(
    OSDMapRef osdmap, OSDMapRef lastmap,
    vector<int>& newup, int up_primary,
    vector<int>& newacting, int acting_primary,
    RecoveryCtx *rctx);
  void handle_activate_map(RecoveryCtx *rctx);
  void handle_create(RecoveryCtx *rctx);
  void handle_loaded(RecoveryCtx *rctx);
  void handle_query_state(Formatter *f);

  virtual void on_removal(ObjectStore::Transaction *t) = 0;


  // abstract bits
  virtual void do_request(
    OpRequestRef& op,
    ThreadPool::TPHandle &handle
  ) = 0;

  virtual void do_op(OpRequestRef& op) = 0;
  virtual void do_sub_op(OpRequestRef op) = 0;
  virtual void do_sub_op_reply(OpRequestRef op) = 0;
  virtual void do_scan(
    OpRequestRef op,
    ThreadPool::TPHandle &handle
  ) = 0;
  virtual void do_backfill(OpRequestRef op) = 0;
  virtual void snap_trimmer(epoch_t epoch_queued) = 0;

  virtual int do_command(
    cmdmap_t cmdmap,
    ostream& ss,
    bufferlist& idata,
    bufferlist& odata,
    ConnectionRef conn,
    ceph_tid_t tid) = 0;

  virtual void on_role_change() = 0;
  virtual void on_pool_change() = 0;
  virtual void on_change(ObjectStore::Transaction *t) = 0;
  virtual void on_activate() = 0;
  virtual void on_flushed() = 0;
  virtual void on_shutdown() = 0;
  virtual void check_blacklisted_watchers() = 0;
  virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;

  virtual bool agent_work(int max) = 0;
  virtual bool agent_work(int max, int agent_flush_quota) = 0;
  virtual void agent_stop() = 0;
  virtual void agent_delay() = 0;
  virtual void agent_clear() = 0;
  virtual void agent_choose_mode_restart() = 0;
};

ostream& operator<<(ostream& out, const PG& pg);

ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);

#endif