// Source: ceph.git — ceph/src/osd/PGBackend.h (sources updated to v12.2.3)
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013,2014 Inktank Storage, Inc.
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
17
18 #ifndef PGBACKEND_H
19 #define PGBACKEND_H
20
21 #include "osd_types.h"
22 #include "common/WorkQueue.h"
23 #include "include/Context.h"
24 #include "os/ObjectStore.h"
25 #include "common/LogClient.h"
26 #include <string>
27 #include "PGTransaction.h"
28
29 namespace Scrub {
30 class Store;
31 }
32 struct shard_info_wrapper;
33 struct inconsistent_obj_wrapper;
34
35 //forward declaration
36 class OSDMap;
37 class PGLog;
38 typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
39
40 /**
41 * PGBackend
42 *
43 * PGBackend defines an interface for logic handling IO and
44 * replication on RADOS objects. The PGBackend implementation
45 * is responsible for:
46 *
47 * 1) Handling client operations
48 * 2) Handling object recovery
49 * 3) Handling object access
50 * 4) Handling scrub, deep-scrub, repair
51 */
52 class PGBackend {
53 public:
54 CephContext* cct;
55 protected:
56 ObjectStore *store;
57 const coll_t coll;
58 ObjectStore::CollectionHandle &ch;
59 public:
60 /**
61 * Provides interfaces for PGBackend callbacks
62 *
63 * The intention is that the parent calls into the PGBackend
64 * implementation holding a lock and that the callbacks are
65 * called under the same locks.
66 */
67 class Listener {
68 public:
69 /// Debugging
70 virtual DoutPrefixProvider *get_dpp() = 0;
71
72 /// Recovery
73
74 /**
75 * Called with the transaction recovering oid
76 */
77 virtual void on_local_recover(
78 const hobject_t &oid,
79 const ObjectRecoveryInfo &recovery_info,
80 ObjectContextRef obc,
81 bool is_delete,
82 ObjectStore::Transaction *t
83 ) = 0;
84
85 /**
86 * Called when transaction recovering oid is durable and
87 * applied on all replicas
88 */
89 virtual void on_global_recover(
90 const hobject_t &oid,
91 const object_stat_sum_t &stat_diff,
92 bool is_delete
93 ) = 0;
94
95 /**
96 * Called when peer is recovered
97 */
98 virtual void on_peer_recover(
99 pg_shard_t peer,
100 const hobject_t &oid,
101 const ObjectRecoveryInfo &recovery_info
102 ) = 0;
103
104 virtual void begin_peer_recover(
105 pg_shard_t peer,
106 const hobject_t oid) = 0;
107
108 virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
109 virtual void finish_degraded_object(const hobject_t& oid) = 0;
110 virtual void primary_failed(const hobject_t &soid) = 0;
111 virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
112 virtual void cancel_pull(const hobject_t &soid) = 0;
113
114 virtual void apply_stats(
115 const hobject_t &soid,
116 const object_stat_sum_t &delta_stats) = 0;
117
118 /**
119 * Called when a read on the primary fails when pushing
120 */
121 virtual void on_primary_error(
122 const hobject_t &oid,
123 eversion_t v
124 ) = 0;
125
126 virtual void backfill_add_missing(
127 const hobject_t &oid,
128 eversion_t v
129 ) = 0;
130
131 virtual void remove_missing_object(const hobject_t &oid,
132 eversion_t v,
133 Context *on_complete) = 0;
134
135 /**
136 * Bless a context
137 *
138 * Wraps a context in whatever outer layers the parent usually
139 * uses to call into the PGBackend
140 */
141 virtual Context *bless_context(Context *c) = 0;
142 virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
143 GenContext<ThreadPool::TPHandle&> *c) = 0;
144
145 virtual void send_message(int to_osd, Message *m) = 0;
146 virtual void queue_transaction(
147 ObjectStore::Transaction&& t,
148 OpRequestRef op = OpRequestRef()
149 ) = 0;
150 virtual void queue_transactions(
151 vector<ObjectStore::Transaction>& tls,
152 OpRequestRef op = OpRequestRef()
153 ) = 0;
154 virtual epoch_t get_epoch() const = 0;
155 virtual epoch_t get_interval_start_epoch() const = 0;
156 virtual epoch_t get_last_peering_reset_epoch() const = 0;
157
158 virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
159 virtual const set<pg_shard_t> &get_acting_shards() const = 0;
160 virtual const set<pg_shard_t> &get_backfill_shards() const = 0;
161
162 virtual std::string gen_dbg_prefix() const = 0;
163
164 virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards()
165 const = 0;
166
167 virtual const pg_missing_tracker_t &get_local_missing() const = 0;
168 virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
169 const = 0;
170 virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing(
171 pg_shard_t peer) const {
172 if (peer == primary_shard()) {
173 return get_local_missing();
174 } else {
175 map<pg_shard_t, pg_missing_t>::const_iterator i =
176 get_shard_missing().find(peer);
177 if (i == get_shard_missing().end()) {
178 return boost::optional<const pg_missing_const_i &>();
179 } else {
180 return i->second;
181 }
182 }
183 }
184 virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
185 auto m = maybe_get_shard_missing(peer);
186 assert(m);
187 return *m;
188 }
189
190 virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
191 virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
192 if (peer == primary_shard()) {
193 return get_info();
194 } else {
195 map<pg_shard_t, pg_info_t>::const_iterator i =
196 get_shard_info().find(peer);
197 assert(i != get_shard_info().end());
198 return i->second;
199 }
200 }
201
202 virtual const PGLog &get_log() const = 0;
203 virtual bool pgb_is_primary() const = 0;
204 virtual OSDMapRef pgb_get_osdmap() const = 0;
205 virtual const pg_info_t &get_info() const = 0;
206 virtual const pg_pool_t &get_pool() const = 0;
207
208 virtual ObjectContextRef get_obc(
209 const hobject_t &hoid,
210 const map<string, bufferlist> &attrs) = 0;
211
212 virtual bool try_lock_for_read(
213 const hobject_t &hoid,
214 ObcLockManager &manager) = 0;
215
216 virtual void release_locks(ObcLockManager &manager) = 0;
217
218 virtual void op_applied(
219 const eversion_t &applied_version) = 0;
220
221 virtual bool should_send_op(
222 pg_shard_t peer,
223 const hobject_t &hoid) = 0;
224
225 virtual void log_operation(
226 const vector<pg_log_entry_t> &logv,
227 const boost::optional<pg_hit_set_history_t> &hset_history,
228 const eversion_t &trim_to,
229 const eversion_t &roll_forward_to,
230 bool transaction_applied,
231 ObjectStore::Transaction &t) = 0;
232
233 virtual void pgb_set_object_snap_mapping(
234 const hobject_t &soid,
235 const set<snapid_t> &snaps,
236 ObjectStore::Transaction *t) = 0;
237
238 virtual void pgb_clear_object_snap_mapping(
239 const hobject_t &soid,
240 ObjectStore::Transaction *t) = 0;
241
242 virtual void update_peer_last_complete_ondisk(
243 pg_shard_t fromosd,
244 eversion_t lcod) = 0;
245
246 virtual void update_last_complete_ondisk(
247 eversion_t lcod) = 0;
248
249 virtual void update_stats(
250 const pg_stat_t &stat) = 0;
251
252 virtual void schedule_recovery_work(
253 GenContext<ThreadPool::TPHandle&> *c) = 0;
254
255 virtual pg_shard_t whoami_shard() const = 0;
256 int whoami() const {
257 return whoami_shard().osd;
258 }
259 spg_t whoami_spg_t() const {
260 return get_info().pgid;
261 }
262
263 virtual spg_t primary_spg_t() const = 0;
264 virtual pg_shard_t primary_shard() const = 0;
265
266 virtual uint64_t min_peer_features() const = 0;
267
268 virtual hobject_t get_temp_recovery_object(const hobject_t& target,
269 eversion_t version) = 0;
270
271 virtual void send_message_osd_cluster(
272 int peer, Message *m, epoch_t from_epoch) = 0;
273 virtual void send_message_osd_cluster(
274 Message *m, Connection *con) = 0;
275 virtual void send_message_osd_cluster(
276 Message *m, const ConnectionRef& con) = 0;
277 virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
278 virtual entity_name_t get_cluster_msgr_name() = 0;
279
280 virtual PerfCounters *get_logger() = 0;
281
282 virtual ceph_tid_t get_tid() = 0;
283
284 virtual LogClientTemp clog_error() = 0;
285 virtual LogClientTemp clog_warn() = 0;
286
287 virtual bool check_failsafe_full(ostream &ss) = 0;
288
289 virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
290
291 virtual ~Listener() {}
292 };
293 Listener *parent;
294 Listener *get_parent() const { return parent; }
295 PGBackend(CephContext* cct, Listener *l, ObjectStore *store, coll_t coll,
296 ObjectStore::CollectionHandle &ch) :
297 cct(cct),
298 store(store),
299 coll(coll),
300 ch(ch),
301 parent(l) {}
302 bool is_primary() const { return get_parent()->pgb_is_primary(); }
303 OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
304 const pg_info_t &get_info() { return get_parent()->get_info(); }
305
306 std::string gen_prefix() const {
307 return parent->gen_dbg_prefix();
308 }
309
310 /**
311 * RecoveryHandle
312 *
313 * We may want to recover multiple objects in the same set of
314 * messages. RecoveryHandle is an interface for the opaque
315 * object used by the implementation to store the details of
316 * the pending recovery operations.
317 */
318 struct RecoveryHandle {
319 bool cache_dont_need;
320 map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes;
321
322 RecoveryHandle(): cache_dont_need(false) {}
323 virtual ~RecoveryHandle() {}
324 };
325
326 /// Get a fresh recovery operation
327 virtual RecoveryHandle *open_recovery_op() = 0;
328
329 /// run_recovery_op: finish the operation represented by h
330 virtual void run_recovery_op(
331 RecoveryHandle *h, ///< [in] op to finish
332 int priority ///< [in] msg priority
333 ) = 0;
334
335 void recover_delete_object(const hobject_t &oid, eversion_t v,
336 RecoveryHandle *h);
337 void send_recovery_deletes(int prio,
338 const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes);
339
340 /**
341 * recover_object
342 *
343 * Triggers a recovery operation on the specified hobject_t
344 * onreadable must be called before onwriteable
345 *
346 * On each replica (primary included), get_parent()->on_not_missing()
347 * must be called when the transaction finalizing the recovery
348 * is queued. Similarly, get_parent()->on_readable() must be called
349 * when the transaction is applied in the backing store.
350 *
351 * get_parent()->on_not_degraded() should be called on the primary
352 * when writes can resume on the object.
353 *
354 * obc may be NULL if the primary lacks the object.
355 *
356 * head may be NULL only if the head/snapdir is missing
357 *
358 * @param missing [in] set of info, missing pairs for queried nodes
359 * @param overlaps [in] mapping of object to file offset overlaps
360 */
361 virtual int recover_object(
362 const hobject_t &hoid, ///< [in] object to recover
363 eversion_t v, ///< [in] version to recover
364 ObjectContextRef head, ///< [in] context of the head/snapdir object
365 ObjectContextRef obc, ///< [in] context of the object
366 RecoveryHandle *h ///< [in,out] handle to attach recovery op to
367 ) = 0;
368
369 /**
370 * true if PGBackend can handle this message while inactive
371 *
372 * If it returns true, handle_message *must* also return true
373 */
374 virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
375
376 /// gives PGBackend a crack at an incoming message
377 bool handle_message(
378 OpRequestRef op ///< [in] message received
379 ); ///< @return true if the message was handled
380
381 /// the variant of handle_message that is overridden by child classes
382 virtual bool _handle_message(OpRequestRef op) = 0;
383
384 virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
385
386
387 /**
388 * clean up any temporary on-disk state due to a pg interval change
389 */
390 void on_change_cleanup(ObjectStore::Transaction *t);
391 /**
392 * implementation should clear itself, contexts blessed prior to on_change
393 * won't be called after on_change()
394 */
395 virtual void on_change() = 0;
396 virtual void clear_recovery_state() = 0;
397
398 virtual void on_flushed() = 0;
399
400 virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
401 virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
402
403 virtual void dump_recovery_info(Formatter *f) const = 0;
404
405 private:
406 set<hobject_t> temp_contents;
407 public:
408 // Track contents of temp collection, clear on reset
409 void add_temp_obj(const hobject_t &oid) {
410 temp_contents.insert(oid);
411 }
412 void add_temp_objs(const set<hobject_t> &oids) {
413 temp_contents.insert(oids.begin(), oids.end());
414 }
415 void clear_temp_obj(const hobject_t &oid) {
416 temp_contents.erase(oid);
417 }
418 void clear_temp_objs(const set<hobject_t> &oids) {
419 for (set<hobject_t>::const_iterator i = oids.begin();
420 i != oids.end();
421 ++i) {
422 temp_contents.erase(*i);
423 }
424 }
425
426 virtual ~PGBackend() {}
427
428 /// execute implementation specific transaction
429 virtual void submit_transaction(
430 const hobject_t &hoid, ///< [in] object
431 const object_stat_sum_t &delta_stats,///< [in] stat change
432 const eversion_t &at_version, ///< [in] version
433 PGTransactionUPtr &&t, ///< [in] trans to execute (move)
434 const eversion_t &trim_to, ///< [in] trim log to here
435 const eversion_t &roll_forward_to, ///< [in] trim rollback info to here
436 const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
437 /// [in] hitset history (if updated with this transaction)
438 boost::optional<pg_hit_set_history_t> &hset_history,
439 Context *on_local_applied_sync, ///< [in] called when applied locally
440 Context *on_all_applied, ///< [in] called when all acked
441 Context *on_all_commit, ///< [in] called when all commit
442 ceph_tid_t tid, ///< [in] tid
443 osd_reqid_t reqid, ///< [in] reqid
444 OpRequestRef op ///< [in] op
445 ) = 0;
446
447 /// submit callback to be called in order with pending writes
448 virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
449
450 void try_stash(
451 const hobject_t &hoid,
452 version_t v,
453 ObjectStore::Transaction *t);
454
455 void rollback(
456 const pg_log_entry_t &entry,
457 ObjectStore::Transaction *t);
458
459 friend class LRBTrimmer;
460 void rollforward(
461 const pg_log_entry_t &entry,
462 ObjectStore::Transaction *t);
463
464 void trim(
465 const pg_log_entry_t &entry,
466 ObjectStore::Transaction *t);
467
468 void remove(
469 const hobject_t &hoid,
470 ObjectStore::Transaction *t);
471
472 protected:
473
474 void handle_recovery_delete(OpRequestRef op);
475 void handle_recovery_delete_reply(OpRequestRef op);
476
477 /// Reapply old attributes
478 void rollback_setattrs(
479 const hobject_t &hoid,
480 map<string, boost::optional<bufferlist> > &old_attrs,
481 ObjectStore::Transaction *t);
482
483 /// Truncate object to rollback append
484 virtual void rollback_append(
485 const hobject_t &hoid,
486 uint64_t old_size,
487 ObjectStore::Transaction *t);
488
489 /// Unstash object to rollback stash
490 void rollback_stash(
491 const hobject_t &hoid,
492 version_t old_version,
493 ObjectStore::Transaction *t);
494
495 /// Unstash object to rollback stash
496 void rollback_try_stash(
497 const hobject_t &hoid,
498 version_t old_version,
499 ObjectStore::Transaction *t);
500
501 /// Delete object to rollback create
502 void rollback_create(
503 const hobject_t &hoid,
504 ObjectStore::Transaction *t) {
505 remove(hoid, t);
506 }
507
508 /// Clone the extents back into place
509 void rollback_extents(
510 version_t gen,
511 const vector<pair<uint64_t, uint64_t> > &extents,
512 const hobject_t &hoid,
513 ObjectStore::Transaction *t);
514 public:
515
516 /// Trim object stashed at version
517 void trim_rollback_object(
518 const hobject_t &hoid,
519 version_t gen,
520 ObjectStore::Transaction *t);
521
522 /// List objects in collection
523 int objects_list_partial(
524 const hobject_t &begin,
525 int min,
526 int max,
527 vector<hobject_t> *ls,
528 hobject_t *next);
529
530 int objects_list_range(
531 const hobject_t &start,
532 const hobject_t &end,
533 snapid_t seq,
534 vector<hobject_t> *ls,
535 vector<ghobject_t> *gen_obs=0);
536
537 int objects_get_attr(
538 const hobject_t &hoid,
539 const string &attr,
540 bufferlist *out);
541
542 virtual int objects_get_attrs(
543 const hobject_t &hoid,
544 map<string, bufferlist> *out);
545
546 virtual int objects_read_sync(
547 const hobject_t &hoid,
548 uint64_t off,
549 uint64_t len,
550 uint32_t op_flags,
551 bufferlist *bl) = 0;
552
553 virtual void objects_read_async(
554 const hobject_t &hoid,
555 const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
556 pair<bufferlist*, Context*> > > &to_read,
557 Context *on_complete, bool fast_read = false) = 0;
558
559 virtual bool scrub_supported() = 0;
560 virtual bool auto_repair_supported() const = 0;
561 void be_scan_list(
562 ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
563 ThreadPool::TPHandle &handle);
564 bool be_compare_scrub_objects(
565 pg_shard_t auth_shard,
566 const ScrubMap::object &auth,
567 const object_info_t& auth_oi,
568 const ScrubMap::object &candidate,
569 shard_info_wrapper& shard_error,
570 inconsistent_obj_wrapper &result,
571 ostream &errorstream);
572 map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
573 const hobject_t &obj,
574 const map<pg_shard_t,ScrubMap*> &maps,
575 object_info_t *auth_oi,
576 map<pg_shard_t, shard_info_wrapper> &shard_map,
577 inconsistent_obj_wrapper &object_error);
578 void be_compare_scrubmaps(
579 const map<pg_shard_t,ScrubMap*> &maps,
580 bool repair,
581 map<hobject_t, set<pg_shard_t>> &missing,
582 map<hobject_t, set<pg_shard_t>> &inconsistent,
583 map<hobject_t, list<pg_shard_t>> &authoritative,
584 map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
585 int &shallow_errors, int &deep_errors,
586 Scrub::Store *store,
587 const spg_t& pgid,
588 const vector<int> &acting,
589 ostream &errorstream);
590 virtual uint64_t be_get_ondisk_size(
591 uint64_t logical_size) = 0;
592 virtual void be_deep_scrub(
593 const hobject_t &poid,
594 uint32_t seed,
595 ScrubMap::object &o,
596 ThreadPool::TPHandle &handle) = 0;
597
598 static PGBackend *build_pg_backend(
599 const pg_pool_t &pool,
600 const OSDMapRef curmap,
601 Listener *l,
602 coll_t coll,
603 ObjectStore::CollectionHandle &ch,
604 ObjectStore *store,
605 CephContext *cct);
606 };
607
608 #endif