// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013,2014 Inktank Storage, Inc.
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef PGBACKEND_H
#define PGBACKEND_H

#include "osd_types.h"
#include "common/WorkQueue.h"
#include "include/Context.h"
#include "os/ObjectStore.h"
#include "common/LogClient.h"
#include <string>
#include "PGTransaction.h"

namespace Scrub {
  class Store;
}
struct shard_info_wrapper;
struct inconsistent_obj_wrapper;

//forward declaration
class OSDMap;
class PGLog;
typedef std::shared_ptr<const OSDMap> OSDMapRef;

/**
 * PGBackend
 *
 * PGBackend defines an interface for logic handling IO and
 * replication on RADOS objects. The PGBackend implementation
 * is responsible for:
 *
 * 1) Handling client operations
 * 2) Handling object recovery
 * 3) Handling object access
 * 4) Handling scrub, deep-scrub, repair
 */
class PGBackend {
public:
  CephContext* cct;
protected:
  ObjectStore *store;
  const coll_t coll;
  ObjectStore::CollectionHandle &ch;
public:
  /**
   * Provides interfaces for PGBackend callbacks
   *
   * The intention is that the parent calls into the PGBackend
   * implementation holding a lock and that the callbacks are
   * called under the same locks.
   */
  class Listener {
  public:
    /// Debugging
    virtual DoutPrefixProvider *get_dpp() = 0;

    /// Recovery

    /**
     * Called with the transaction recovering oid
     */
    virtual void on_local_recover(
      const hobject_t &oid,
      const ObjectRecoveryInfo &recovery_info,
      ObjectContextRef obc,
      bool is_delete,
      ObjectStore::Transaction *t
      ) = 0;

    /**
     * Called when transaction recovering oid is durable and
     * applied on all replicas
     */
    virtual void on_global_recover(
      const hobject_t &oid,
      const object_stat_sum_t &stat_diff,
      bool is_delete
      ) = 0;

    /**
     * Called when peer is recovered
     */
    virtual void on_peer_recover(
      pg_shard_t peer,
      const hobject_t &oid,
      const ObjectRecoveryInfo &recovery_info
      ) = 0;

    virtual void begin_peer_recover(
      pg_shard_t peer,
      const hobject_t oid) = 0;

    virtual void failed_push(const list<pg_shard_t> &from,
                             const hobject_t &soid,
                             const eversion_t &need = eversion_t()) = 0;
    virtual void finish_degraded_object(const hobject_t& oid) = 0;
    virtual void primary_failed(const hobject_t &soid) = 0;
    virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
    virtual void cancel_pull(const hobject_t &soid) = 0;

    virtual void apply_stats(
      const hobject_t &soid,
      const object_stat_sum_t &delta_stats) = 0;

    /**
     * Called when a read on the primary fails when pushing
     */
    virtual void on_primary_error(
      const hobject_t &oid,
      eversion_t v
      ) = 0;

    virtual void backfill_add_missing(
      const hobject_t &oid,
      eversion_t v
      ) = 0;

    virtual void remove_missing_object(const hobject_t &oid,
                                       eversion_t v,
                                       Context *on_complete) = 0;


    /**
     * Bless a context
     *
     * Wraps a context in whatever outer layers the parent usually
     * uses to call into the PGBackend
     */
    virtual Context *bless_context(Context *c) = 0;
    virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
      GenContext<ThreadPool::TPHandle&> *c) = 0;
    virtual GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
      GenContext<ThreadPool::TPHandle&> *c) = 0;

    virtual void send_message(int to_osd, Message *m) = 0;
    virtual void queue_transaction(
      ObjectStore::Transaction&& t,
      OpRequestRef op = OpRequestRef()
      ) = 0;
    virtual void queue_transactions(
      vector<ObjectStore::Transaction>& tls,
      OpRequestRef op = OpRequestRef()
      ) = 0;
    virtual epoch_t get_interval_start_epoch() const = 0;
    virtual epoch_t get_last_peering_reset_epoch() const = 0;

    virtual const set<pg_shard_t> &get_acting_recovery_backfill_shards() const = 0;
    virtual const set<pg_shard_t> &get_acting_shards() const = 0;
    virtual const set<pg_shard_t> &get_backfill_shards() const = 0;

    virtual std::ostream& gen_dbg_prefix(std::ostream& out) const = 0;

    virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards()
      const = 0;

    virtual const pg_missing_tracker_t &get_local_missing() const = 0;
    virtual void add_local_next_event(const pg_log_entry_t& e) = 0;
    virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
      const = 0;
    virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing(
      pg_shard_t peer) const {
      if (peer == primary_shard()) {
        return get_local_missing();
      } else {
        map<pg_shard_t, pg_missing_t>::const_iterator i =
          get_shard_missing().find(peer);
        if (i == get_shard_missing().end()) {
          return boost::optional<const pg_missing_const_i &>();
        } else {
          return i->second;
        }
      }
    }
    virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
      auto m = maybe_get_shard_missing(peer);
      ceph_assert(m);
      return *m;
    }

    virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
    virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
      if (peer == primary_shard()) {
        return get_info();
      } else {
        map<pg_shard_t, pg_info_t>::const_iterator i =
          get_shard_info().find(peer);
        ceph_assert(i != get_shard_info().end());
        return i->second;
      }
    }

    virtual const PGLog &get_log() const = 0;
    virtual bool pgb_is_primary() const = 0;
    virtual const OSDMapRef& pgb_get_osdmap() const = 0;
    virtual epoch_t pgb_get_osdmap_epoch() const = 0;
    virtual const pg_info_t &get_info() const = 0;
    virtual const pg_pool_t &get_pool() const = 0;

    virtual ObjectContextRef get_obc(
      const hobject_t &hoid,
      const map<string, bufferlist> &attrs) = 0;

    virtual bool try_lock_for_read(
      const hobject_t &hoid,
      ObcLockManager &manager) = 0;

    virtual void release_locks(ObcLockManager &manager) = 0;

    virtual void op_applied(
      const eversion_t &applied_version) = 0;

    virtual bool should_send_op(
      pg_shard_t peer,
      const hobject_t &hoid) = 0;

    virtual bool pg_is_undersized() const = 0;
    virtual bool pg_is_repair() const = 0;

    virtual void log_operation(
      const vector<pg_log_entry_t> &logv,
      const boost::optional<pg_hit_set_history_t> &hset_history,
      const eversion_t &trim_to,
      const eversion_t &roll_forward_to,
      bool transaction_applied,
      ObjectStore::Transaction &t,
      bool async = false) = 0;

    virtual void pgb_set_object_snap_mapping(
      const hobject_t &soid,
      const set<snapid_t> &snaps,
      ObjectStore::Transaction *t) = 0;

    virtual void pgb_clear_object_snap_mapping(
      const hobject_t &soid,
      ObjectStore::Transaction *t) = 0;

    virtual void update_peer_last_complete_ondisk(
      pg_shard_t fromosd,
      eversion_t lcod) = 0;

    virtual void update_last_complete_ondisk(
      eversion_t lcod) = 0;

    virtual void update_stats(
      const pg_stat_t &stat) = 0;

    virtual void schedule_recovery_work(
      GenContext<ThreadPool::TPHandle&> *c) = 0;

    virtual pg_shard_t whoami_shard() const = 0;
    int whoami() const {
      return whoami_shard().osd;
    }
    spg_t whoami_spg_t() const {
      return get_info().pgid;
    }

    virtual spg_t primary_spg_t() const = 0;
    virtual pg_shard_t primary_shard() const = 0;

    virtual hobject_t get_temp_recovery_object(const hobject_t& target,
                                               eversion_t version) = 0;

    virtual void send_message_osd_cluster(
      int peer, Message *m, epoch_t from_epoch) = 0;
    virtual void send_message_osd_cluster(
      Message *m, Connection *con) = 0;
    virtual void send_message_osd_cluster(
      Message *m, const ConnectionRef& con) = 0;
    virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
    virtual entity_name_t get_cluster_msgr_name() = 0;

    virtual PerfCounters *get_logger() = 0;

    virtual ceph_tid_t get_tid() = 0;

    virtual LogClientTemp clog_error() = 0;
    virtual LogClientTemp clog_warn() = 0;

    virtual bool check_failsafe_full() = 0;

    virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;

    virtual bool pg_is_repair() = 0;
    virtual void inc_osd_stat_repaired() = 0;
    virtual bool pg_is_remote_backfilling() = 0;
    virtual void pg_add_local_num_bytes(int64_t num_bytes) = 0;
    virtual void pg_sub_local_num_bytes(int64_t num_bytes) = 0;
    virtual void pg_add_num_bytes(int64_t num_bytes) = 0;
    virtual void pg_sub_num_bytes(int64_t num_bytes) = 0;
    virtual bool maybe_preempt_replica_scrub(const hobject_t& oid) = 0;
    virtual ~Listener() {}
  };
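
  // Illustrative sketch (assumptions noted): a backend implementation that has
  // just assembled a recovery transaction might hand its completion back
  // through the Listener above. C_OnRecoveryApplied is a hypothetical Context
  // subclass, and register_on_commit() is ObjectStore::Transaction API that
  // lives outside this header; only the Listener calls are taken from the
  // interface declared here.
  //
  //   ObjectStore::Transaction t;
  //   get_parent()->on_local_recover(oid, recovery_info, obc, false, &t);
  //   Context *fin = get_parent()->bless_context(new C_OnRecoveryApplied(/*...*/));
  //   t.register_on_commit(fin);
  //   get_parent()->queue_transaction(std::move(t), op);
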
  Listener *parent;
  Listener *get_parent() const { return parent; }
  PGBackend(CephContext* cct, Listener *l, ObjectStore *store, const coll_t &coll,
            ObjectStore::CollectionHandle &ch) :
    cct(cct),
    store(store),
    coll(coll),
    ch(ch),
    parent(l) {}
  bool is_primary() const { return get_parent()->pgb_is_primary(); }
  const OSDMapRef& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
  epoch_t get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
  const pg_info_t &get_info() { return get_parent()->get_info(); }

  std::ostream& gen_prefix(std::ostream& out) const {
    return parent->gen_dbg_prefix(out);
  }

  /**
   * RecoveryHandle
   *
   * We may want to recover multiple objects in the same set of
   * messages. RecoveryHandle is an interface for the opaque
   * object used by the implementation to store the details of
   * the pending recovery operations.
   */
  struct RecoveryHandle {
    bool cache_dont_need;
    map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes;

    RecoveryHandle(): cache_dont_need(false) {}
    virtual ~RecoveryHandle() {}
  };

  /// Get a fresh recovery operation
  virtual RecoveryHandle *open_recovery_op() = 0;

  /// run_recovery_op: finish the operation represented by h
  virtual void run_recovery_op(
    RecoveryHandle *h,  ///< [in] op to finish
    int priority        ///< [in] msg priority
    ) = 0;

  void recover_delete_object(const hobject_t &oid, eversion_t v,
                             RecoveryHandle *h);
  void send_recovery_deletes(int prio,
                             const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes);

  /**
   * recover_object
   *
   * Triggers a recovery operation on the specified hobject_t
   * onreadable must be called before onwriteable
   *
   * On each replica (primary included), get_parent()->on_not_missing()
   * must be called when the transaction finalizing the recovery
   * is queued. Similarly, get_parent()->on_readable() must be called
   * when the transaction is applied in the backing store.
   *
   * get_parent()->on_not_degraded() should be called on the primary
   * when writes can resume on the object.
   *
   * obc may be NULL if the primary lacks the object.
   *
   * head may be NULL only if the head/snapdir is missing
   *
   * @param missing [in] set of info, missing pairs for queried nodes
   * @param overlaps [in] mapping of object to file offset overlaps
   */
  virtual int recover_object(
    const hobject_t &hoid, ///< [in] object to recover
    eversion_t v,          ///< [in] version to recover
    ObjectContextRef head, ///< [in] context of the head/snapdir object
    ObjectContextRef obc,  ///< [in] context of the object
    RecoveryHandle *h      ///< [in,out] handle to attach recovery op to
    ) = 0;
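
  // A minimal sketch of the recovery-op lifecycle, assuming the caller is the
  // PG's recovery path (soid/v/obc names are placeholders, not fixed API):
  //
  //   RecoveryHandle *h = open_recovery_op();
  //   recover_object(soid, v, head_obc, obc, h);   // queue a push/pull for soid
  //   recover_delete_object(deleted_soid, dv, h);  // queue a replicated delete
  //   run_recovery_op(h, priority);                // build and send the batched messages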

  /**
   * true if PGBackend can handle this message while inactive
   *
   * If it returns true, handle_message *must* also return true
   */
  virtual bool can_handle_while_inactive(OpRequestRef op) = 0;

  /// gives PGBackend a crack at an incoming message
  bool handle_message(
    OpRequestRef op ///< [in] message received
    );              ///< @return true if the message was handled
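
  // Dispatch sketch, assuming a caller that owns a PGBackend pointer: incoming
  // ops are offered to handle_message() first; when it returns false the PG
  // handles the op itself. While inactive, only ops for which
  // can_handle_while_inactive() returns true should be passed in.
  //
  //   if (!pgbackend->handle_message(op)) {
  //     // not a backend message; the PG processes it directly
  //   }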

  /// the variant of handle_message that is overridden by child classes
  virtual bool _handle_message(OpRequestRef op) = 0;

  virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;


  /**
   * clean up any temporary on-disk state due to a pg interval change
   */
  void on_change_cleanup(ObjectStore::Transaction *t);
  /**
   * implementation should clear itself, contexts blessed prior to on_change
   * won't be called after on_change()
   */
  virtual void on_change() = 0;
  virtual void clear_recovery_state() = 0;

  virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() const = 0;
  virtual IsPGReadablePredicate *get_is_readable_predicate() const = 0;
  virtual int get_ec_data_chunk_count() const { return 0; };
  virtual int get_ec_stripe_chunk_size() const { return 0; };

  virtual void dump_recovery_info(Formatter *f) const = 0;

private:
  set<hobject_t> temp_contents;
public:
  // Track contents of temp collection, clear on reset
  void add_temp_obj(const hobject_t &oid) {
    temp_contents.insert(oid);
  }
  void add_temp_objs(const set<hobject_t> &oids) {
    temp_contents.insert(oids.begin(), oids.end());
  }
  void clear_temp_obj(const hobject_t &oid) {
    temp_contents.erase(oid);
  }
  void clear_temp_objs(const set<hobject_t> &oids) {
    for (set<hobject_t>::const_iterator i = oids.begin();
         i != oids.end();
         ++i) {
      temp_contents.erase(*i);
    }
  }
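
  // Intended usage, sketched under the assumption that temp objects are
  // created via get_temp_recovery_object() and that on_change_cleanup()
  // removes anything still tracked after an interval change (exact call
  // sites vary by backend):
  //
  //   hobject_t tmp = get_parent()->get_temp_recovery_object(target, version);
  //   add_temp_obj(tmp);     // track it while recovery data is written into it
  //   // ... write into tmp, then move it into place ...
  //   clear_temp_obj(tmp);   // no longer temporary once renamed to its final name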

  virtual ~PGBackend() {}

  /// execute implementation specific transaction
  virtual void submit_transaction(
    const hobject_t &hoid,                ///< [in] object
    const object_stat_sum_t &delta_stats, ///< [in] stat change
    const eversion_t &at_version,         ///< [in] version
    PGTransactionUPtr &&t,                ///< [in] trans to execute (move)
    const eversion_t &trim_to,            ///< [in] trim log to here
    const eversion_t &roll_forward_to,    ///< [in] trim rollback info to here
    const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
    /// [in] hitset history (if updated with this transaction)
    boost::optional<pg_hit_set_history_t> &hset_history,
    Context *on_all_commit,               ///< [in] called when all commit
    ceph_tid_t tid,                       ///< [in] tid
    osd_reqid_t reqid,                    ///< [in] reqid
    OpRequestRef op                       ///< [in] op
    ) = 0;
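
  // A minimal primary-side call sketch; argument names are placeholders and
  // C_AllCommitted stands in for whatever completion Context the caller uses:
  //
  //   backend->submit_transaction(
  //     soid, delta_stats, at_version,
  //     std::move(txn),              // PGTransactionUPtr describing the object update
  //     trim_to, roll_forward_to,
  //     log_entries, hset_history,
  //     new C_AllCommitted(/*...*/), // fires once every shard has committed
  //     tid, reqid, op);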

  /// submit callback to be called in order with pending writes
  virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;

  void try_stash(
    const hobject_t &hoid,
    version_t v,
    ObjectStore::Transaction *t);

  void rollback(
    const pg_log_entry_t &entry,
    ObjectStore::Transaction *t);

  friend class LRBTrimmer;
  void rollforward(
    const pg_log_entry_t &entry,
    ObjectStore::Transaction *t);

  void trim(
    const pg_log_entry_t &entry,
    ObjectStore::Transaction *t);

  void remove(
    const hobject_t &hoid,
    ObjectStore::Transaction *t);

protected:

  void handle_recovery_delete(OpRequestRef op);
  void handle_recovery_delete_reply(OpRequestRef op);

  /// Reapply old attributes
  void rollback_setattrs(
    const hobject_t &hoid,
    map<string, boost::optional<bufferlist> > &old_attrs,
    ObjectStore::Transaction *t);

  /// Truncate object to rollback append
  virtual void rollback_append(
    const hobject_t &hoid,
    uint64_t old_size,
    ObjectStore::Transaction *t);

  /// Unstash object to rollback stash
  void rollback_stash(
    const hobject_t &hoid,
    version_t old_version,
    ObjectStore::Transaction *t);

  /// Unstash object to rollback stash
  void rollback_try_stash(
    const hobject_t &hoid,
    version_t old_version,
    ObjectStore::Transaction *t);

  /// Delete object to rollback create
  void rollback_create(
    const hobject_t &hoid,
    ObjectStore::Transaction *t) {
    remove(hoid, t);
  }

  /// Clone the extents back into place
  void rollback_extents(
    version_t gen,
    const vector<pair<uint64_t, uint64_t> > &extents,
    const hobject_t &hoid,
    ObjectStore::Transaction *t);
public:

  /// Trim object stashed at version
  void trim_rollback_object(
    const hobject_t &hoid,
    version_t gen,
    ObjectStore::Transaction *t);

  /// List objects in collection
  int objects_list_partial(
    const hobject_t &begin,
    int min,
    int max,
    vector<hobject_t> *ls,
    hobject_t *next);

  int objects_list_range(
    const hobject_t &start,
    const hobject_t &end,
    vector<hobject_t> *ls,
    vector<ghobject_t> *gen_obs=0);

  int objects_get_attr(
    const hobject_t &hoid,
    const string &attr,
    bufferlist *out);

  virtual int objects_get_attrs(
    const hobject_t &hoid,
    map<string, bufferlist> *out);

  virtual int objects_read_sync(
    const hobject_t &hoid,
    uint64_t off,
    uint64_t len,
    uint32_t op_flags,
    bufferlist *bl) = 0;

  virtual void objects_read_async(
    const hobject_t &hoid,
    const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
                    pair<bufferlist*, Context*> > > &to_read,
    Context *on_complete, bool fast_read = false) = 0;
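
  // Read-path sketch (illustrative; hoid/off/len/on_complete are placeholders).
  // The synchronous variant fills a bufferlist directly; the async variant
  // takes (offset, length, flags) extents paired with per-extent destinations
  // and signals on_complete when the reads finish:
  //
  //   bufferlist bl;
  //   int r = backend->objects_read_sync(hoid, off, len, op_flags, &bl);
  //
  //   list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
  //             pair<bufferlist*, Context*> > > to_read;
  //   to_read.push_back(make_pair(boost::make_tuple(off, len, op_flags),
  //                               make_pair(&bl, nullptr)));
  //   backend->objects_read_async(hoid, to_read, on_complete);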

  virtual bool auto_repair_supported() const = 0;
  int be_scan_list(
    ScrubMap &map,
    ScrubMapBuilder &pos);
  bool be_compare_scrub_objects(
    pg_shard_t auth_shard,
    const ScrubMap::object &auth,
    const object_info_t& auth_oi,
    const ScrubMap::object &candidate,
    shard_info_wrapper& shard_error,
    inconsistent_obj_wrapper &result,
    ostream &errorstream,
    bool has_snapset);
  map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
    const hobject_t &obj,
    const map<pg_shard_t,ScrubMap*> &maps,
    object_info_t *auth_oi,
    map<pg_shard_t, shard_info_wrapper> &shard_map,
    bool &digest_match,
    spg_t pgid,
    ostream &errorstream);
  void be_compare_scrubmaps(
    const map<pg_shard_t,ScrubMap*> &maps,
    const set<hobject_t> &master_set,
    bool repair,
    map<hobject_t, set<pg_shard_t>> &missing,
    map<hobject_t, set<pg_shard_t>> &inconsistent,
    map<hobject_t, list<pg_shard_t>> &authoritative,
    map<hobject_t, pair<boost::optional<uint32_t>,
                        boost::optional<uint32_t>>> &missing_digest,
    int &shallow_errors, int &deep_errors,
    Scrub::Store *store,
    const spg_t& pgid,
    const vector<int> &acting,
    ostream &errorstream);
  virtual uint64_t be_get_ondisk_size(
    uint64_t logical_size) = 0;
  virtual int be_deep_scrub(
    const hobject_t &oid,
    ScrubMap &map,
    ScrubMapBuilder &pos,
    ScrubMap::object &o) = 0;
  void be_omap_checks(
    const map<pg_shard_t,ScrubMap*> &maps,
    const set<hobject_t> &master_set,
    omap_stat_t& omap_stats,
    ostream &warnstream) const;

  static PGBackend *build_pg_backend(
    const pg_pool_t &pool,
    const map<string,string>& profile,
    Listener *l,
    coll_t coll,
    ObjectStore::CollectionHandle &ch,
    ObjectStore *store,
    CephContext *cct);
};
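
// Construction sketch (hedged): elsewhere in the Ceph tree the concrete
// backends are ReplicatedBackend and ECBackend, and build_pg_backend() is
// expected to choose between them based on the pool type and erasure-code
// profile. The call below is illustrative only; the real call site lives in
// the PG implementation:
//
//   PGBackend *backend = PGBackend::build_pg_backend(
//     pool_info, ec_profile, listener, coll, ch, store, cct);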

#endif