]>
git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
21 #include "osd_types.h"
22 #include "common/WorkQueue.h"
23 #include "include/Context.h"
24 #include "os/ObjectStore.h"
25 #include "common/LogClient.h"
27 #include "PGTransaction.h"
/// Forward declarations: both types appear below only as reference
/// parameters (see be_compare_scrub_objects).
struct shard_info_wrapper;
struct inconsistent_obj_wrapper;
38 typedef std::shared_ptr
<const OSDMap
> OSDMapRef
;
43 * PGBackend defines an interface for logic handling IO and
44 * replication on RADOS objects. The PGBackend implementation
47 * 1) Handling client operations
48 * 2) Handling object recovery
49 * 3) Handling object access
50 * 4) Handling scrub, deep-scrub, repair
58 ObjectStore::CollectionHandle
&ch
;
61 * Provides interfaces for PGBackend callbacks
63 * The intention is that the parent calls into the PGBackend
64 * implementation holding a lock and that the callbacks are
65 * called under the same locks.
70 virtual DoutPrefixProvider
*get_dpp() = 0;
75 * Called with the transaction recovering oid
77 virtual void on_local_recover(
79 const ObjectRecoveryInfo
&recovery_info
,
82 ObjectStore::Transaction
*t
86 * Called when transaction recovering oid is durable and
87 * applied on all replicas
89 virtual void on_global_recover(
91 const object_stat_sum_t
&stat_diff
,
96 * Called when peer is recovered
98 virtual void on_peer_recover(
100 const hobject_t
&oid
,
101 const ObjectRecoveryInfo
&recovery_info
104 virtual void begin_peer_recover(
106 const hobject_t oid
) = 0;
108 virtual void failed_push(const list
<pg_shard_t
> &from
,
109 const hobject_t
&soid
,
110 const eversion_t
&need
= eversion_t()) = 0;
111 virtual void finish_degraded_object(const hobject_t
& oid
) = 0;
112 virtual void primary_failed(const hobject_t
&soid
) = 0;
113 virtual bool primary_error(const hobject_t
& soid
, eversion_t v
) = 0;
114 virtual void cancel_pull(const hobject_t
&soid
) = 0;
116 virtual void apply_stats(
117 const hobject_t
&soid
,
118 const object_stat_sum_t
&delta_stats
) = 0;
121 * Called when a read on the primary fails when pushing
123 virtual void on_primary_error(
124 const hobject_t
&oid
,
128 virtual void backfill_add_missing(
129 const hobject_t
&oid
,
133 virtual void remove_missing_object(const hobject_t
&oid
,
135 Context
*on_complete
) = 0;
141 * Wraps a context in whatever outer layers the parent usually
142 * uses to call into the PGBackend
144 virtual Context
*bless_context(Context
*c
) = 0;
145 virtual GenContext
<ThreadPool::TPHandle
&> *bless_gencontext(
146 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
147 virtual GenContext
<ThreadPool::TPHandle
&> *bless_unlocked_gencontext(
148 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
150 virtual void send_message(int to_osd
, Message
*m
) = 0;
151 virtual void queue_transaction(
152 ObjectStore::Transaction
&& t
,
153 OpRequestRef op
= OpRequestRef()
155 virtual void queue_transactions(
156 vector
<ObjectStore::Transaction
>& tls
,
157 OpRequestRef op
= OpRequestRef()
159 virtual epoch_t
get_interval_start_epoch() const = 0;
160 virtual epoch_t
get_last_peering_reset_epoch() const = 0;
162 virtual const set
<pg_shard_t
> &get_acting_recovery_backfill_shards() const = 0;
163 virtual const set
<pg_shard_t
> &get_acting_shards() const = 0;
164 virtual const set
<pg_shard_t
> &get_backfill_shards() const = 0;
166 virtual std::ostream
& gen_dbg_prefix(std::ostream
& out
) const = 0;
168 virtual const map
<hobject_t
, set
<pg_shard_t
>> &get_missing_loc_shards()
171 virtual const pg_missing_tracker_t
&get_local_missing() const = 0;
172 virtual void add_local_next_event(const pg_log_entry_t
& e
) = 0;
173 virtual const map
<pg_shard_t
, pg_missing_t
> &get_shard_missing()
175 virtual boost::optional
<const pg_missing_const_i
&> maybe_get_shard_missing(
176 pg_shard_t peer
) const {
177 if (peer
== primary_shard()) {
178 return get_local_missing();
180 map
<pg_shard_t
, pg_missing_t
>::const_iterator i
=
181 get_shard_missing().find(peer
);
182 if (i
== get_shard_missing().end()) {
183 return boost::optional
<const pg_missing_const_i
&>();
189 virtual const pg_missing_const_i
&get_shard_missing(pg_shard_t peer
) const {
190 auto m
= maybe_get_shard_missing(peer
);
195 virtual const map
<pg_shard_t
, pg_info_t
> &get_shard_info() const = 0;
196 virtual const pg_info_t
&get_shard_info(pg_shard_t peer
) const {
197 if (peer
== primary_shard()) {
200 map
<pg_shard_t
, pg_info_t
>::const_iterator i
=
201 get_shard_info().find(peer
);
202 ceph_assert(i
!= get_shard_info().end());
207 virtual const PGLog
&get_log() const = 0;
208 virtual bool pgb_is_primary() const = 0;
209 virtual const OSDMapRef
& pgb_get_osdmap() const = 0;
210 virtual epoch_t
pgb_get_osdmap_epoch() const = 0;
211 virtual const pg_info_t
&get_info() const = 0;
212 virtual const pg_pool_t
&get_pool() const = 0;
214 virtual ObjectContextRef
get_obc(
215 const hobject_t
&hoid
,
216 const map
<string
, bufferlist
> &attrs
) = 0;
218 virtual bool try_lock_for_read(
219 const hobject_t
&hoid
,
220 ObcLockManager
&manager
) = 0;
222 virtual void release_locks(ObcLockManager
&manager
) = 0;
224 virtual void op_applied(
225 const eversion_t
&applied_version
) = 0;
227 virtual bool should_send_op(
229 const hobject_t
&hoid
) = 0;
231 virtual bool pg_is_undersized() const = 0;
232 virtual bool pg_is_repair() const = 0;
234 virtual void log_operation(
235 const vector
<pg_log_entry_t
> &logv
,
236 const boost::optional
<pg_hit_set_history_t
> &hset_history
,
237 const eversion_t
&trim_to
,
238 const eversion_t
&roll_forward_to
,
239 bool transaction_applied
,
240 ObjectStore::Transaction
&t
,
241 bool async
= false) = 0;
243 virtual void pgb_set_object_snap_mapping(
244 const hobject_t
&soid
,
245 const set
<snapid_t
> &snaps
,
246 ObjectStore::Transaction
*t
) = 0;
248 virtual void pgb_clear_object_snap_mapping(
249 const hobject_t
&soid
,
250 ObjectStore::Transaction
*t
) = 0;
252 virtual void update_peer_last_complete_ondisk(
254 eversion_t lcod
) = 0;
256 virtual void update_last_complete_ondisk(
257 eversion_t lcod
) = 0;
259 virtual void update_stats(
260 const pg_stat_t
&stat
) = 0;
262 virtual void schedule_recovery_work(
263 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
265 virtual pg_shard_t
whoami_shard() const = 0;
267 return whoami_shard().osd
;
269 spg_t
whoami_spg_t() const {
270 return get_info().pgid
;
273 virtual spg_t
primary_spg_t() const = 0;
274 virtual pg_shard_t
primary_shard() const = 0;
276 virtual hobject_t
get_temp_recovery_object(const hobject_t
& target
,
277 eversion_t version
) = 0;
279 virtual void send_message_osd_cluster(
280 int peer
, Message
*m
, epoch_t from_epoch
) = 0;
281 virtual void send_message_osd_cluster(
282 Message
*m
, Connection
*con
) = 0;
283 virtual void send_message_osd_cluster(
284 Message
*m
, const ConnectionRef
& con
) = 0;
285 virtual ConnectionRef
get_con_osd_cluster(int peer
, epoch_t from_epoch
) = 0;
286 virtual entity_name_t
get_cluster_msgr_name() = 0;
288 virtual PerfCounters
*get_logger() = 0;
290 virtual ceph_tid_t
get_tid() = 0;
292 virtual LogClientTemp
clog_error() = 0;
293 virtual LogClientTemp
clog_warn() = 0;
295 virtual bool check_failsafe_full() = 0;
297 virtual bool check_osdmap_full(const set
<pg_shard_t
> &missing_on
) = 0;
299 virtual bool pg_is_repair() = 0;
300 virtual void inc_osd_stat_repaired() = 0;
301 virtual bool pg_is_remote_backfilling() = 0;
302 virtual void pg_add_local_num_bytes(int64_t num_bytes
) = 0;
303 virtual void pg_sub_local_num_bytes(int64_t num_bytes
) = 0;
304 virtual void pg_add_num_bytes(int64_t num_bytes
) = 0;
305 virtual void pg_sub_num_bytes(int64_t num_bytes
) = 0;
306 virtual bool maybe_preempt_replica_scrub(const hobject_t
& oid
) = 0;
307 virtual ~Listener() {}
310 Listener
*get_parent() const { return parent
; }
311 PGBackend(CephContext
* cct
, Listener
*l
, ObjectStore
*store
, const coll_t
&coll
,
312 ObjectStore::CollectionHandle
&ch
) :
318 bool is_primary() const { return get_parent()->pgb_is_primary(); }
319 const OSDMapRef
& get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
320 epoch_t
get_osdmap_epoch() const { return get_parent()->pgb_get_osdmap_epoch(); }
321 const pg_info_t
&get_info() { return get_parent()->get_info(); }
323 std::ostream
& gen_prefix(std::ostream
& out
) const {
324 return parent
->gen_dbg_prefix(out
);
330 * We may want to recover multiple objects in the same set of
331 * messages. RecoveryHandle is an interface for the opaque
332 * object used by the implementation to store the details of
333 * the pending recovery operations.
335 struct RecoveryHandle
{
336 bool cache_dont_need
;
337 map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > deletes
;
339 RecoveryHandle(): cache_dont_need(false) {}
340 virtual ~RecoveryHandle() {}
343 /// Get a fresh recovery operation
344 virtual RecoveryHandle
*open_recovery_op() = 0;
346 /// run_recovery_op: finish the operation represented by h
347 virtual void run_recovery_op(
348 RecoveryHandle
*h
, ///< [in] op to finish
349 int priority
///< [in] msg priority
352 void recover_delete_object(const hobject_t
&oid
, eversion_t v
,
354 void send_recovery_deletes(int prio
,
355 const map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > &deletes
);
360 * Triggers a recovery operation on the specified hobject_t
361 * onreadable must be called before onwriteable
363 * On each replica (primary included), get_parent()->on_not_missing()
364 * must be called when the transaction finalizing the recovery
365 * is queued. Similarly, get_parent()->on_readable() must be called
366 * when the transaction is applied in the backing store.
368 * get_parent()->on_not_degraded() should be called on the primary
369 * when writes can resume on the object.
371 * obc may be NULL if the primary lacks the object.
373 * head may be NULL only if the head/snapdir is missing
375 * @param missing [in] set of info, missing pairs for queried nodes
376 * @param overlaps [in] mapping of object to file offset overlaps
378 virtual int recover_object(
379 const hobject_t
&hoid
, ///< [in] object to recover
380 eversion_t v
, ///< [in] version to recover
381 ObjectContextRef head
, ///< [in] context of the head/snapdir object
382 ObjectContextRef obc
, ///< [in] context of the object
383 RecoveryHandle
*h
///< [in,out] handle to attach recovery op to
387 * true if PGBackend can handle this message while inactive
389 * If it returns true, handle_message *must* also return true
391 virtual bool can_handle_while_inactive(OpRequestRef op
) = 0;
393 /// gives PGBackend a crack at an incoming message
395 OpRequestRef op
///< [in] message received
396 ); ///< @return true if the message was handled
398 /// the variant of handle_message that is overridden by child classes
399 virtual bool _handle_message(OpRequestRef op
) = 0;
401 virtual void check_recovery_sources(const OSDMapRef
& osdmap
) = 0;
405 * clean up any temporary on-disk state due to a pg interval change
407 void on_change_cleanup(ObjectStore::Transaction
*t
);
409 * implementation should clear itself, contexts blessed prior to on_change
410 * won't be called after on_change()
412 virtual void on_change() = 0;
413 virtual void clear_recovery_state() = 0;
415 virtual IsPGRecoverablePredicate
*get_is_recoverable_predicate() const = 0;
416 virtual IsPGReadablePredicate
*get_is_readable_predicate() const = 0;
417 virtual int get_ec_data_chunk_count() const { return 0; };
418 virtual int get_ec_stripe_chunk_size() const { return 0; };
420 virtual void dump_recovery_info(Formatter
*f
) const = 0;
423 set
<hobject_t
> temp_contents
;
425 // Track contents of temp collection, clear on reset
426 void add_temp_obj(const hobject_t
&oid
) {
427 temp_contents
.insert(oid
);
429 void add_temp_objs(const set
<hobject_t
> &oids
) {
430 temp_contents
.insert(oids
.begin(), oids
.end());
432 void clear_temp_obj(const hobject_t
&oid
) {
433 temp_contents
.erase(oid
);
435 void clear_temp_objs(const set
<hobject_t
> &oids
) {
436 for (set
<hobject_t
>::const_iterator i
= oids
.begin();
439 temp_contents
.erase(*i
);
443 virtual ~PGBackend() {}
445 /// execute implementation specific transaction
446 virtual void submit_transaction(
447 const hobject_t
&hoid
, ///< [in] object
448 const object_stat_sum_t
&delta_stats
,///< [in] stat change
449 const eversion_t
&at_version
, ///< [in] version
450 PGTransactionUPtr
&&t
, ///< [in] trans to execute (move)
451 const eversion_t
&trim_to
, ///< [in] trim log to here
452 const eversion_t
&roll_forward_to
, ///< [in] trim rollback info to here
453 const vector
<pg_log_entry_t
> &log_entries
, ///< [in] log entries for t
454 /// [in] hitset history (if updated with this transaction)
455 boost::optional
<pg_hit_set_history_t
> &hset_history
,
456 Context
*on_all_commit
, ///< [in] called when all commit
457 ceph_tid_t tid
, ///< [in] tid
458 osd_reqid_t reqid
, ///< [in] reqid
459 OpRequestRef op
///< [in] op
462 /// submit callback to be called in order with pending writes
463 virtual void call_write_ordered(std::function
<void(void)> &&cb
) = 0;
466 const hobject_t
&hoid
,
468 ObjectStore::Transaction
*t
);
471 const pg_log_entry_t
&entry
,
472 ObjectStore::Transaction
*t
);
474 friend class LRBTrimmer
;
476 const pg_log_entry_t
&entry
,
477 ObjectStore::Transaction
*t
);
480 const pg_log_entry_t
&entry
,
481 ObjectStore::Transaction
*t
);
484 const hobject_t
&hoid
,
485 ObjectStore::Transaction
*t
);
489 void handle_recovery_delete(OpRequestRef op
);
490 void handle_recovery_delete_reply(OpRequestRef op
);
492 /// Reapply old attributes
493 void rollback_setattrs(
494 const hobject_t
&hoid
,
495 map
<string
, boost::optional
<bufferlist
> > &old_attrs
,
496 ObjectStore::Transaction
*t
);
498 /// Truncate object to rollback append
499 virtual void rollback_append(
500 const hobject_t
&hoid
,
502 ObjectStore::Transaction
*t
);
504 /// Unstash object to rollback stash
506 const hobject_t
&hoid
,
507 version_t old_version
,
508 ObjectStore::Transaction
*t
);
510 /// Unstash object to rollback stash
511 void rollback_try_stash(
512 const hobject_t
&hoid
,
513 version_t old_version
,
514 ObjectStore::Transaction
*t
);
516 /// Delete object to rollback create
517 void rollback_create(
518 const hobject_t
&hoid
,
519 ObjectStore::Transaction
*t
) {
523 /// Clone the extents back into place
524 void rollback_extents(
526 const vector
<pair
<uint64_t, uint64_t> > &extents
,
527 const hobject_t
&hoid
,
528 ObjectStore::Transaction
*t
);
531 /// Trim object stashed at version
532 void trim_rollback_object(
533 const hobject_t
&hoid
,
535 ObjectStore::Transaction
*t
);
537 /// List objects in collection
538 int objects_list_partial(
539 const hobject_t
&begin
,
542 vector
<hobject_t
> *ls
,
545 int objects_list_range(
546 const hobject_t
&start
,
547 const hobject_t
&end
,
548 vector
<hobject_t
> *ls
,
549 vector
<ghobject_t
> *gen_obs
=0);
551 int objects_get_attr(
552 const hobject_t
&hoid
,
556 virtual int objects_get_attrs(
557 const hobject_t
&hoid
,
558 map
<string
, bufferlist
> *out
);
560 virtual int objects_read_sync(
561 const hobject_t
&hoid
,
567 virtual void objects_read_async(
568 const hobject_t
&hoid
,
569 const list
<pair
<boost::tuple
<uint64_t, uint64_t, uint32_t>,
570 pair
<bufferlist
*, Context
*> > > &to_read
,
571 Context
*on_complete
, bool fast_read
= false) = 0;
573 virtual bool auto_repair_supported() const = 0;
576 ScrubMapBuilder
&pos
);
577 bool be_compare_scrub_objects(
578 pg_shard_t auth_shard
,
579 const ScrubMap::object
&auth
,
580 const object_info_t
& auth_oi
,
581 const ScrubMap::object
&candidate
,
582 shard_info_wrapper
& shard_error
,
583 inconsistent_obj_wrapper
&result
,
584 ostream
&errorstream
,
586 map
<pg_shard_t
, ScrubMap
*>::const_iterator
be_select_auth_object(
587 const hobject_t
&obj
,
588 const map
<pg_shard_t
,ScrubMap
*> &maps
,
589 object_info_t
*auth_oi
,
590 map
<pg_shard_t
, shard_info_wrapper
> &shard_map
,
593 ostream
&errorstream
);
594 void be_compare_scrubmaps(
595 const map
<pg_shard_t
,ScrubMap
*> &maps
,
596 const set
<hobject_t
> &master_set
,
598 map
<hobject_t
, set
<pg_shard_t
>> &missing
,
599 map
<hobject_t
, set
<pg_shard_t
>> &inconsistent
,
600 map
<hobject_t
, list
<pg_shard_t
>> &authoritative
,
601 map
<hobject_t
, pair
<boost::optional
<uint32_t>,
602 boost::optional
<uint32_t>>> &missing_digest
,
603 int &shallow_errors
, int &deep_errors
,
606 const vector
<int> &acting
,
607 ostream
&errorstream
);
608 virtual uint64_t be_get_ondisk_size(
609 uint64_t logical_size
) = 0;
610 virtual int be_deep_scrub(
611 const hobject_t
&oid
,
613 ScrubMapBuilder
&pos
,
614 ScrubMap::object
&o
) = 0;
616 const map
<pg_shard_t
,ScrubMap
*> &maps
,
617 const set
<hobject_t
> &master_set
,
618 omap_stat_t
& omap_stats
,
619 ostream
&warnstream
) const;
621 static PGBackend
*build_pg_backend(
622 const pg_pool_t
&pool
,
623 const map
<string
,string
>& profile
,
626 ObjectStore::CollectionHandle
&ch
,