]>
git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
21 #include "osd_types.h"
22 #include "common/WorkQueue.h"
23 #include "include/Context.h"
24 #include "os/ObjectStore.h"
25 #include "common/LogClient.h"
27 #include "PGTransaction.h"
28 #include "common/ostream_temp.h"
33 struct shard_info_wrapper
;
34 struct inconsistent_obj_wrapper
;
39 typedef std::shared_ptr
<const OSDMap
> OSDMapRef
;
44 * PGBackend defines an interface for logic handling IO and
45 * replication on RADOS objects. The PGBackend implementation
48 * 1) Handling client operations
49 * 2) Handling object recovery
50 * 3) Handling object access
51 * 4) Handling scrub, deep-scrub, repair
59 ObjectStore::CollectionHandle
&ch
;
62 * Provides interfaces for PGBackend callbacks
64 * The intention is that the parent calls into the PGBackend
65 * implementation holding a lock and that the callbacks are
66 * called under the same locks.
71 virtual DoutPrefixProvider
*get_dpp() = 0;
76 * Called with the transaction recovering oid
78 virtual void on_local_recover(
80 const ObjectRecoveryInfo
&recovery_info
,
83 ObjectStore::Transaction
*t
87 * Called when transaction recovering oid is durable and
88 * applied on all replicas
90 virtual void on_global_recover(
92 const object_stat_sum_t
&stat_diff
,
97 * Called when peer is recovered
99 virtual void on_peer_recover(
101 const hobject_t
&oid
,
102 const ObjectRecoveryInfo
&recovery_info
105 virtual void begin_peer_recover(
107 const hobject_t oid
) = 0;
109 virtual void apply_stats(
110 const hobject_t
&soid
,
111 const object_stat_sum_t
&delta_stats
) = 0;
114 * Called when a read from a set of replicas/primary fails
116 virtual void on_failed_pull(
117 const std::set
<pg_shard_t
> &from
,
118 const hobject_t
&soid
,
123 * Called when a pull on soid cannot be completed due to
126 virtual void cancel_pull(
127 const hobject_t
&soid
) = 0;
130 * Called to remove an object.
132 virtual void remove_missing_object(
133 const hobject_t
&oid
,
135 Context
*on_complete
) = 0;
140 * Wraps a context in whatever outer layers the parent usually
141 * uses to call into the PGBackend
143 virtual Context
*bless_context(Context
*c
) = 0;
144 virtual GenContext
<ThreadPool::TPHandle
&> *bless_gencontext(
145 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
146 virtual GenContext
<ThreadPool::TPHandle
&> *bless_unlocked_gencontext(
147 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
149 virtual void send_message(int to_osd
, Message
*m
) = 0;
150 virtual void queue_transaction(
151 ObjectStore::Transaction
&& t
,
152 OpRequestRef op
= OpRequestRef()
154 virtual void queue_transactions(
155 std::vector
<ObjectStore::Transaction
>& tls
,
156 OpRequestRef op
= OpRequestRef()
158 virtual epoch_t
get_interval_start_epoch() const = 0;
159 virtual epoch_t
get_last_peering_reset_epoch() const = 0;
161 virtual const std::set
<pg_shard_t
> &get_acting_recovery_backfill_shards() const = 0;
162 virtual const std::set
<pg_shard_t
> &get_acting_shards() const = 0;
163 virtual const std::set
<pg_shard_t
> &get_backfill_shards() const = 0;
165 virtual std::ostream
& gen_dbg_prefix(std::ostream
& out
) const = 0;
167 virtual const std::map
<hobject_t
, std::set
<pg_shard_t
>> &get_missing_loc_shards()
170 virtual const pg_missing_tracker_t
&get_local_missing() const = 0;
171 virtual void add_local_next_event(const pg_log_entry_t
& e
) = 0;
172 virtual const std::map
<pg_shard_t
, pg_missing_t
> &get_shard_missing()
174 virtual const pg_missing_const_i
* maybe_get_shard_missing(
175 pg_shard_t peer
) const {
176 if (peer
== primary_shard()) {
177 return &get_local_missing();
179 std::map
<pg_shard_t
, pg_missing_t
>::const_iterator i
=
180 get_shard_missing().find(peer
);
181 if (i
== get_shard_missing().end()) {
188 virtual const pg_missing_const_i
&get_shard_missing(pg_shard_t peer
) const {
189 auto m
= maybe_get_shard_missing(peer
);
194 virtual const std::map
<pg_shard_t
, pg_info_t
> &get_shard_info() const = 0;
195 virtual const pg_info_t
&get_shard_info(pg_shard_t peer
) const {
196 if (peer
== primary_shard()) {
199 std::map
<pg_shard_t
, pg_info_t
>::const_iterator i
=
200 get_shard_info().find(peer
);
201 ceph_assert(i
!= get_shard_info().end());
206 virtual const PGLog
&get_log() const = 0;
207 virtual bool pgb_is_primary() const = 0;
208 virtual const OSDMapRef
& pgb_get_osdmap() const = 0;
209 virtual epoch_t
pgb_get_osdmap_epoch() const = 0;
210 virtual const pg_info_t
&get_info() const = 0;
211 virtual const pg_pool_t
&get_pool() const = 0;
213 virtual ObjectContextRef
get_obc(
214 const hobject_t
&hoid
,
215 const std::map
<std::string
, ceph::buffer::list
, std::less
<>> &attrs
) = 0;
217 virtual bool try_lock_for_read(
218 const hobject_t
&hoid
,
219 ObcLockManager
&manager
) = 0;
221 virtual void release_locks(ObcLockManager
&manager
) = 0;
223 virtual void op_applied(
224 const eversion_t
&applied_version
) = 0;
226 virtual bool should_send_op(
228 const hobject_t
&hoid
) = 0;
230 virtual bool pg_is_undersized() const = 0;
231 virtual bool pg_is_repair() const = 0;
233 virtual void log_operation(
234 std::vector
<pg_log_entry_t
>&& logv
,
235 const std::optional
<pg_hit_set_history_t
> &hset_history
,
236 const eversion_t
&trim_to
,
237 const eversion_t
&roll_forward_to
,
238 const eversion_t
&min_last_complete_ondisk
,
239 bool transaction_applied
,
240 ObjectStore::Transaction
&t
,
241 bool async
= false) = 0;
243 virtual void pgb_set_object_snap_mapping(
244 const hobject_t
&soid
,
245 const std::set
<snapid_t
> &snaps
,
246 ObjectStore::Transaction
*t
) = 0;
248 virtual void pgb_clear_object_snap_mapping(
249 const hobject_t
&soid
,
250 ObjectStore::Transaction
*t
) = 0;
252 virtual void update_peer_last_complete_ondisk(
254 eversion_t lcod
) = 0;
256 virtual void update_last_complete_ondisk(
257 eversion_t lcod
) = 0;
259 virtual void update_stats(
260 const pg_stat_t
&stat
) = 0;
262 virtual void schedule_recovery_work(
263 GenContext
<ThreadPool::TPHandle
&> *c
,
266 virtual pg_shard_t
whoami_shard() const = 0;
268 return whoami_shard().osd
;
270 spg_t
whoami_spg_t() const {
271 return get_info().pgid
;
274 virtual spg_t
primary_spg_t() const = 0;
275 virtual pg_shard_t
primary_shard() const = 0;
276 virtual uint64_t min_peer_features() const = 0;
277 virtual uint64_t min_upacting_features() const = 0;
278 virtual hobject_t
get_temp_recovery_object(const hobject_t
& target
,
279 eversion_t version
) = 0;
281 virtual void send_message_osd_cluster(
282 int peer
, Message
*m
, epoch_t from_epoch
) = 0;
283 virtual void send_message_osd_cluster(
284 std::vector
<std::pair
<int, Message
*>>& messages
, epoch_t from_epoch
) = 0;
285 virtual void send_message_osd_cluster(
286 MessageRef
, Connection
*con
) = 0;
287 virtual void send_message_osd_cluster(
288 Message
*m
, const ConnectionRef
& con
) = 0;
289 virtual ConnectionRef
get_con_osd_cluster(int peer
, epoch_t from_epoch
) = 0;
290 virtual entity_name_t
get_cluster_msgr_name() = 0;
292 virtual PerfCounters
*get_logger() = 0;
294 virtual ceph_tid_t
get_tid() = 0;
296 virtual OstreamTemp
clog_error() = 0;
297 virtual OstreamTemp
clog_warn() = 0;
299 virtual bool check_failsafe_full() = 0;
301 virtual void inc_osd_stat_repaired() = 0;
302 virtual bool pg_is_remote_backfilling() = 0;
303 virtual void pg_add_local_num_bytes(int64_t num_bytes
) = 0;
304 virtual void pg_sub_local_num_bytes(int64_t num_bytes
) = 0;
305 virtual void pg_add_num_bytes(int64_t num_bytes
) = 0;
306 virtual void pg_sub_num_bytes(int64_t num_bytes
) = 0;
307 virtual bool maybe_preempt_replica_scrub(const hobject_t
& oid
) = 0;
308 virtual ~Listener() {}
311 Listener
*get_parent() const { return parent
; }
312 PGBackend(CephContext
* cct
, Listener
*l
, ObjectStore
*store
, const coll_t
&coll
,
313 ObjectStore::CollectionHandle
&ch
) :
319 bool is_primary() const { return get_parent()->pgb_is_primary(); }
320 const OSDMapRef
& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
321 epoch_t
get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
322 const pg_info_t
&get_info() { return get_parent()->get_info(); }
324 std::ostream
& gen_prefix(std::ostream
& out
) const {
325 return parent
->gen_dbg_prefix(out
);
331 * We may want to recover multiple objects in the same set of
332 * messages. RecoveryHandle is an interface for the opaque
333 * object used by the implementation to store the details of
334 * the pending recovery operations.
336 struct RecoveryHandle
{
337 bool cache_dont_need
;
338 std::map
<pg_shard_t
, std::vector
<std::pair
<hobject_t
, eversion_t
> > > deletes
;
340 RecoveryHandle(): cache_dont_need(false) {}
341 virtual ~RecoveryHandle() {}
344 /// Get a fresh recovery operation
345 virtual RecoveryHandle
*open_recovery_op() = 0;
347 /// run_recovery_op: finish the operation represented by h
348 virtual void run_recovery_op(
349 RecoveryHandle
*h
, ///< [in] op to finish
350 int priority
///< [in] msg priority
353 void recover_delete_object(const hobject_t
&oid
, eversion_t v
,
355 void send_recovery_deletes(int prio
,
356 const std::map
<pg_shard_t
, std::vector
<std::pair
<hobject_t
, eversion_t
> > > &deletes
);
361 * Triggers a recovery operation on the specified hobject_t
362 * onreadable must be called before onwriteable
364 * On each replica (primary included), get_parent()->on_not_missing()
365 * must be called when the transaction finalizing the recovery
366 * is queued. Similarly, get_parent()->on_readable() must be called
367 * when the transaction is applied in the backing store.
369 * get_parent()->on_not_degraded() should be called on the primary
370 * when writes can resume on the object.
372 * obc may be NULL if the primary lacks the object.
374 * head may be NULL only if the head/snapdir is missing
376 * @param missing [in] set of (info, missing) pairs for queried nodes
377 * @param overlaps [in] mapping of object to file offset overlaps
379 virtual int recover_object(
380 const hobject_t
&hoid
, ///< [in] object to recover
381 eversion_t v
, ///< [in] version to recover
382 ObjectContextRef head
, ///< [in] context of the head/snapdir object
383 ObjectContextRef obc
, ///< [in] context of the object
384 RecoveryHandle
*h
///< [in,out] handle to attach recovery op to
388 * true if PGBackend can handle this message while inactive
390 * If it returns true, handle_message *must* also return true
392 virtual bool can_handle_while_inactive(OpRequestRef op
) = 0;
394 /// gives PGBackend a crack at an incoming message
396 OpRequestRef op
///< [in] message received
397 ); ///< @return true if the message was handled
399 /// the variant of handle_message that is overridden by child classes
400 virtual bool _handle_message(OpRequestRef op
) = 0;
402 virtual void check_recovery_sources(const OSDMapRef
& osdmap
) = 0;
406 * clean up any temporary on-disk state due to a pg interval change
408 void on_change_cleanup(ObjectStore::Transaction
*t
);
410 * implementation should clear itself, contexts blessed prior to on_change
411 * won't be called after on_change()
413 virtual void on_change() = 0;
414 virtual void clear_recovery_state() = 0;
416 virtual IsPGRecoverablePredicate
*get_is_recoverable_predicate() const = 0;
417 virtual IsPGReadablePredicate
*get_is_readable_predicate() const = 0;
418 virtual int get_ec_data_chunk_count() const { return 0; };
419 virtual int get_ec_stripe_chunk_size() const { return 0; };
421 virtual void dump_recovery_info(ceph::Formatter
*f
) const = 0;
424 std::set
<hobject_t
> temp_contents
;
426 // Track contents of temp collection, clear on reset
427 void add_temp_obj(const hobject_t
&oid
) {
428 temp_contents
.insert(oid
);
430 void add_temp_objs(const std::set
<hobject_t
> &oids
) {
431 temp_contents
.insert(oids
.begin(), oids
.end());
433 void clear_temp_obj(const hobject_t
&oid
) {
434 temp_contents
.erase(oid
);
436 void clear_temp_objs(const std::set
<hobject_t
> &oids
) {
437 for (std::set
<hobject_t
>::const_iterator i
= oids
.begin();
440 temp_contents
.erase(*i
);
444 virtual ~PGBackend() {}
446 /// execute implementation specific transaction
447 virtual void submit_transaction(
448 const hobject_t
&hoid
, ///< [in] object
449 const object_stat_sum_t
&delta_stats
,///< [in] stat change
450 const eversion_t
&at_version
, ///< [in] version
451 PGTransactionUPtr
&&t
, ///< [in] trans to execute (move)
452 const eversion_t
&trim_to
, ///< [in] trim log to here
453 const eversion_t
&min_last_complete_ondisk
, ///< [in] lower bound on
454 /// committed version
455 std::vector
<pg_log_entry_t
>&& log_entries
, ///< [in] log entries for t
456 /// [in] hitset history (if updated with this transaction)
457 std::optional
<pg_hit_set_history_t
> &hset_history
,
458 Context
*on_all_commit
, ///< [in] called when all commit
459 ceph_tid_t tid
, ///< [in] tid
460 osd_reqid_t reqid
, ///< [in] reqid
461 OpRequestRef op
///< [in] op
464 /// submit callback to be called in order with pending writes
465 virtual void call_write_ordered(std::function
<void(void)> &&cb
) = 0;
468 const hobject_t
&hoid
,
470 ObjectStore::Transaction
*t
);
473 const pg_log_entry_t
&entry
,
474 ObjectStore::Transaction
*t
);
476 friend class LRBTrimmer
;
478 const pg_log_entry_t
&entry
,
479 ObjectStore::Transaction
*t
);
482 const pg_log_entry_t
&entry
,
483 ObjectStore::Transaction
*t
);
486 const hobject_t
&hoid
,
487 ObjectStore::Transaction
*t
);
491 void handle_recovery_delete(OpRequestRef op
);
492 void handle_recovery_delete_reply(OpRequestRef op
);
494 /// Reapply old attributes
495 void rollback_setattrs(
496 const hobject_t
&hoid
,
497 std::map
<std::string
, std::optional
<ceph::buffer::list
> > &old_attrs
,
498 ObjectStore::Transaction
*t
);
500 /// Truncate object to rollback append
501 virtual void rollback_append(
502 const hobject_t
&hoid
,
504 ObjectStore::Transaction
*t
);
506 /// Unstash object to rollback stash
508 const hobject_t
&hoid
,
509 version_t old_version
,
510 ObjectStore::Transaction
*t
);
512 /// Unstash object to rollback stash
513 void rollback_try_stash(
514 const hobject_t
&hoid
,
515 version_t old_version
,
516 ObjectStore::Transaction
*t
);
518 /// Delete object to rollback create
519 void rollback_create(
520 const hobject_t
&hoid
,
521 ObjectStore::Transaction
*t
) {
525 /// Clone the extents back into place
526 void rollback_extents(
528 const std::vector
<std::pair
<uint64_t, uint64_t> > &extents
,
529 const hobject_t
&hoid
,
530 ObjectStore::Transaction
*t
);
533 /// Trim object stashed at version
534 void trim_rollback_object(
535 const hobject_t
&hoid
,
537 ObjectStore::Transaction
*t
);
539 /// List objects in collection
540 int objects_list_partial(
541 const hobject_t
&begin
,
544 std::vector
<hobject_t
> *ls
,
547 int objects_list_range(
548 const hobject_t
&start
,
549 const hobject_t
&end
,
550 std::vector
<hobject_t
> *ls
,
551 std::vector
<ghobject_t
> *gen_obs
=0);
553 int objects_get_attr(
554 const hobject_t
&hoid
,
555 const std::string
&attr
,
556 ceph::buffer::list
*out
);
558 virtual int objects_get_attrs(
559 const hobject_t
&hoid
,
560 std::map
<std::string
, ceph::buffer::list
, std::less
<>> *out
);
562 virtual int objects_read_sync(
563 const hobject_t
&hoid
,
567 ceph::buffer::list
*bl
) = 0;
569 virtual int objects_readv_sync(
570 const hobject_t
&hoid
,
571 std::map
<uint64_t, uint64_t>&& m
,
573 ceph::buffer::list
*bl
) {
577 virtual void objects_read_async(
578 const hobject_t
&hoid
,
579 const std::list
<std::pair
<boost::tuple
<uint64_t, uint64_t, uint32_t>,
580 std::pair
<ceph::buffer::list
*, Context
*> > > &to_read
,
581 Context
*on_complete
, bool fast_read
= false) = 0;
583 virtual bool auto_repair_supported() const = 0;
586 ScrubMapBuilder
&pos
);
588 virtual uint64_t be_get_ondisk_size(
589 uint64_t logical_size
) const = 0;
591 virtual int be_deep_scrub(
592 const hobject_t
&oid
,
594 ScrubMapBuilder
&pos
,
595 ScrubMap::object
&o
) = 0;
597 static PGBackend
*build_pg_backend(
598 const pg_pool_t
&pool
,
599 const std::map
<std::string
,std::string
>& profile
,
602 ObjectStore::CollectionHandle
&ch
,