]>
git.proxmox.com Git - ceph.git/blob - ceph/src/osd/PGBackend.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013,2014 Inktank Storage, Inc.
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
21 #include "osd_types.h"
22 #include "common/WorkQueue.h"
23 #include "include/Context.h"
24 #include "os/ObjectStore.h"
25 #include "common/LogClient.h"
27 #include "PGTransaction.h"
32 struct shard_info_wrapper
;
33 struct inconsistent_obj_wrapper
;
38 typedef ceph::shared_ptr
<const OSDMap
> OSDMapRef
;
43 * PGBackend defines an interface for logic handling IO and
44 * replication on RADOS objects. The PGBackend implementation
47 * 1) Handling client operations
48 * 2) Handling object recovery
49 * 3) Handling object access
50 * 4) Handling scrub, deep-scrub, repair
58 ObjectStore::CollectionHandle
&ch
;
61 * Provides interfaces for PGBackend callbacks
63 * The intention is that the parent calls into the PGBackend
64 * implementation holding a lock and that the callbacks are
65 * called under the same locks.
70 virtual DoutPrefixProvider
*get_dpp() = 0;
75 * Called with the transaction recovering oid
77 virtual void on_local_recover(
79 const ObjectRecoveryInfo
&recovery_info
,
82 ObjectStore::Transaction
*t
86 * Called when transaction recovering oid is durable and
87 * applied on all replicas
89 virtual void on_global_recover(
91 const object_stat_sum_t
&stat_diff
,
96 * Called when peer is recovered
98 virtual void on_peer_recover(
100 const hobject_t
&oid
,
101 const ObjectRecoveryInfo
&recovery_info
104 virtual void begin_peer_recover(
106 const hobject_t oid
) = 0;
108 virtual void failed_push(const list
<pg_shard_t
> &from
, const hobject_t
&soid
) = 0;
109 virtual void finish_degraded_object(const hobject_t
& oid
) = 0;
110 virtual void primary_failed(const hobject_t
&soid
) = 0;
111 virtual bool primary_error(const hobject_t
& soid
, eversion_t v
) = 0;
112 virtual void cancel_pull(const hobject_t
&soid
) = 0;
114 virtual void apply_stats(
115 const hobject_t
&soid
,
116 const object_stat_sum_t
&delta_stats
) = 0;
119 * Called when a read on the primary fails when pushing
121 virtual void on_primary_error(
122 const hobject_t
&oid
,
126 virtual void backfill_add_missing(
127 const hobject_t
&oid
,
131 virtual void remove_missing_object(const hobject_t
&oid
,
133 Context
*on_complete
) = 0;
139 * Wraps a context in whatever outer layers the parent usually
140 * uses to call into the PGBackend
142 virtual Context
*bless_context(Context
*c
) = 0;
143 virtual GenContext
<ThreadPool::TPHandle
&> *bless_gencontext(
144 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
146 virtual void send_message(int to_osd
, Message
*m
) = 0;
147 virtual void queue_transaction(
148 ObjectStore::Transaction
&& t
,
149 OpRequestRef op
= OpRequestRef()
151 virtual void queue_transactions(
152 vector
<ObjectStore::Transaction
>& tls
,
153 OpRequestRef op
= OpRequestRef()
155 virtual epoch_t
get_epoch() const = 0;
156 virtual epoch_t
get_interval_start_epoch() const = 0;
157 virtual epoch_t
get_last_peering_reset_epoch() const = 0;
159 virtual const set
<pg_shard_t
> &get_actingbackfill_shards() const = 0;
160 virtual const set
<pg_shard_t
> &get_acting_shards() const = 0;
161 virtual const set
<pg_shard_t
> &get_backfill_shards() const = 0;
163 virtual std::string
gen_dbg_prefix() const = 0;
165 virtual const map
<hobject_t
, set
<pg_shard_t
>> &get_missing_loc_shards()
168 virtual const pg_missing_tracker_t
&get_local_missing() const = 0;
169 virtual const map
<pg_shard_t
, pg_missing_t
> &get_shard_missing()
171 virtual boost::optional
<const pg_missing_const_i
&> maybe_get_shard_missing(
172 pg_shard_t peer
) const {
173 if (peer
== primary_shard()) {
174 return get_local_missing();
176 map
<pg_shard_t
, pg_missing_t
>::const_iterator i
=
177 get_shard_missing().find(peer
);
178 if (i
== get_shard_missing().end()) {
179 return boost::optional
<const pg_missing_const_i
&>();
185 virtual const pg_missing_const_i
&get_shard_missing(pg_shard_t peer
) const {
186 auto m
= maybe_get_shard_missing(peer
);
191 virtual const map
<pg_shard_t
, pg_info_t
> &get_shard_info() const = 0;
192 virtual const pg_info_t
&get_shard_info(pg_shard_t peer
) const {
193 if (peer
== primary_shard()) {
196 map
<pg_shard_t
, pg_info_t
>::const_iterator i
=
197 get_shard_info().find(peer
);
198 assert(i
!= get_shard_info().end());
203 virtual const PGLog
&get_log() const = 0;
204 virtual bool pgb_is_primary() const = 0;
205 virtual OSDMapRef
pgb_get_osdmap() const = 0;
206 virtual const pg_info_t
&get_info() const = 0;
207 virtual const pg_pool_t
&get_pool() const = 0;
209 virtual ObjectContextRef
get_obc(
210 const hobject_t
&hoid
,
211 const map
<string
, bufferlist
> &attrs
) = 0;
213 virtual bool try_lock_for_read(
214 const hobject_t
&hoid
,
215 ObcLockManager
&manager
) = 0;
217 virtual void release_locks(ObcLockManager
&manager
) = 0;
219 virtual void op_applied(
220 const eversion_t
&applied_version
) = 0;
222 virtual bool should_send_op(
224 const hobject_t
&hoid
) = 0;
226 virtual void log_operation(
227 const vector
<pg_log_entry_t
> &logv
,
228 const boost::optional
<pg_hit_set_history_t
> &hset_history
,
229 const eversion_t
&trim_to
,
230 const eversion_t
&roll_forward_to
,
231 bool transaction_applied
,
232 ObjectStore::Transaction
&t
) = 0;
234 virtual void pgb_set_object_snap_mapping(
235 const hobject_t
&soid
,
236 const set
<snapid_t
> &snaps
,
237 ObjectStore::Transaction
*t
) = 0;
239 virtual void pgb_clear_object_snap_mapping(
240 const hobject_t
&soid
,
241 ObjectStore::Transaction
*t
) = 0;
243 virtual void update_peer_last_complete_ondisk(
245 eversion_t lcod
) = 0;
247 virtual void update_last_complete_ondisk(
248 eversion_t lcod
) = 0;
250 virtual void update_stats(
251 const pg_stat_t
&stat
) = 0;
253 virtual void schedule_recovery_work(
254 GenContext
<ThreadPool::TPHandle
&> *c
) = 0;
256 virtual pg_shard_t
whoami_shard() const = 0;
258 return whoami_shard().osd
;
260 spg_t
whoami_spg_t() const {
261 return get_info().pgid
;
264 virtual spg_t
primary_spg_t() const = 0;
265 virtual pg_shard_t
primary_shard() const = 0;
267 virtual uint64_t min_peer_features() const = 0;
269 virtual hobject_t
get_temp_recovery_object(const hobject_t
& target
,
270 eversion_t version
) = 0;
272 virtual void send_message_osd_cluster(
273 int peer
, Message
*m
, epoch_t from_epoch
) = 0;
274 virtual void send_message_osd_cluster(
275 Message
*m
, Connection
*con
) = 0;
276 virtual void send_message_osd_cluster(
277 Message
*m
, const ConnectionRef
& con
) = 0;
278 virtual ConnectionRef
get_con_osd_cluster(int peer
, epoch_t from_epoch
) = 0;
279 virtual entity_name_t
get_cluster_msgr_name() = 0;
281 virtual PerfCounters
*get_logger() = 0;
283 virtual ceph_tid_t
get_tid() = 0;
285 virtual LogClientTemp
clog_error() = 0;
286 virtual LogClientTemp
clog_warn() = 0;
288 virtual bool check_failsafe_full(ostream
&ss
) = 0;
290 virtual bool check_osdmap_full(const set
<pg_shard_t
> &missing_on
) = 0;
292 virtual bool maybe_preempt_replica_scrub(const hobject_t
& oid
) = 0;
293 virtual ~Listener() {}
296 Listener
*get_parent() const { return parent
; }
297 PGBackend(CephContext
* cct
, Listener
*l
, ObjectStore
*store
, coll_t coll
,
298 ObjectStore::CollectionHandle
&ch
) :
304 bool is_primary() const { return get_parent()->pgb_is_primary(); }
305 OSDMapRef
get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
306 const pg_info_t
&get_info() { return get_parent()->get_info(); }
308 std::string
gen_prefix() const {
309 return parent
->gen_dbg_prefix();
315 * We may want to recover multiple objects in the same set of
316 * messages. RecoveryHandle is an interface for the opaque
317 * object used by the implementation to store the details of
318 * the pending recovery operations.
320 struct RecoveryHandle
{
321 bool cache_dont_need
;
322 map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > deletes
;
324 RecoveryHandle(): cache_dont_need(false) {}
325 virtual ~RecoveryHandle() {}
328 /// Get a fresh recovery operation
329 virtual RecoveryHandle
*open_recovery_op() = 0;
331 /// run_recovery_op: finish the operation represented by h
332 virtual void run_recovery_op(
333 RecoveryHandle
*h
, ///< [in] op to finish
334 int priority
///< [in] msg priority
337 void recover_delete_object(const hobject_t
&oid
, eversion_t v
,
339 void send_recovery_deletes(int prio
,
340 const map
<pg_shard_t
, vector
<pair
<hobject_t
, eversion_t
> > > &deletes
);
345 * Triggers a recovery operation on the specified hobject_t
346 * onreadable must be called before onwriteable
348 * On each replica (primary included), get_parent()->on_not_missing()
349 * must be called when the transaction finalizing the recovery
350 * is queued. Similarly, get_parent()->on_readable() must be called
351 * when the transaction is applied in the backing store.
353 * get_parent()->on_not_degraded() should be called on the primary
354 * when writes can resume on the object.
356 * obc may be NULL if the primary lacks the object.
358 * head may be NULL only if the head/snapdir is missing
360 * @param missing [in] set of info, missing pairs for queried nodes
361 * @param overlaps [in] mapping of object to file offset overlaps
363 virtual int recover_object(
364 const hobject_t
&hoid
, ///< [in] object to recover
365 eversion_t v
, ///< [in] version to recover
366 ObjectContextRef head
, ///< [in] context of the head/snapdir object
367 ObjectContextRef obc
, ///< [in] context of the object
368 RecoveryHandle
*h
///< [in,out] handle to attach recovery op to
372 * true if PGBackend can handle this message while inactive
374 * If it returns true, handle_message *must* also return true
376 virtual bool can_handle_while_inactive(OpRequestRef op
) = 0;
378 /// gives PGBackend a crack at an incoming message
380 OpRequestRef op
///< [in] message received
381 ); ///< @return true if the message was handled
383 /// the variant of handle_message that is overridden by child classes
384 virtual bool _handle_message(OpRequestRef op
) = 0;
386 virtual void check_recovery_sources(const OSDMapRef
& osdmap
) = 0;
390 * clean up any temporary on-disk state due to a pg interval change
392 void on_change_cleanup(ObjectStore::Transaction
*t
);
394 * implementation should clear itself, contexts blessed prior to on_change
395 * won't be called after on_change()
397 virtual void on_change() = 0;
398 virtual void clear_recovery_state() = 0;
400 virtual void on_flushed() = 0;
402 virtual IsPGRecoverablePredicate
*get_is_recoverable_predicate() = 0;
403 virtual IsPGReadablePredicate
*get_is_readable_predicate() = 0;
405 virtual void dump_recovery_info(Formatter
*f
) const = 0;
408 set
<hobject_t
> temp_contents
;
410 // Track contents of temp collection, clear on reset
411 void add_temp_obj(const hobject_t
&oid
) {
412 temp_contents
.insert(oid
);
414 void add_temp_objs(const set
<hobject_t
> &oids
) {
415 temp_contents
.insert(oids
.begin(), oids
.end());
417 void clear_temp_obj(const hobject_t
&oid
) {
418 temp_contents
.erase(oid
);
420 void clear_temp_objs(const set
<hobject_t
> &oids
) {
421 for (set
<hobject_t
>::const_iterator i
= oids
.begin();
424 temp_contents
.erase(*i
);
428 virtual ~PGBackend() {}
430 /// execute implementation specific transaction
431 virtual void submit_transaction(
432 const hobject_t
&hoid
, ///< [in] object
433 const object_stat_sum_t
&delta_stats
,///< [in] stat change
434 const eversion_t
&at_version
, ///< [in] version
435 PGTransactionUPtr
&&t
, ///< [in] trans to execute (move)
436 const eversion_t
&trim_to
, ///< [in] trim log to here
437 const eversion_t
&roll_forward_to
, ///< [in] trim rollback info to here
438 const vector
<pg_log_entry_t
> &log_entries
, ///< [in] log entries for t
439 /// [in] hitset history (if updated with this transaction)
440 boost::optional
<pg_hit_set_history_t
> &hset_history
,
441 Context
*on_local_applied_sync
, ///< [in] called when applied locally
442 Context
*on_all_applied
, ///< [in] called when all acked
443 Context
*on_all_commit
, ///< [in] called when all commit
444 ceph_tid_t tid
, ///< [in] tid
445 osd_reqid_t reqid
, ///< [in] reqid
446 OpRequestRef op
///< [in] op
449 /// submit callback to be called in order with pending writes
450 virtual void call_write_ordered(std::function
<void(void)> &&cb
) = 0;
453 const hobject_t
&hoid
,
455 ObjectStore::Transaction
*t
);
458 const pg_log_entry_t
&entry
,
459 ObjectStore::Transaction
*t
);
461 friend class LRBTrimmer
;
463 const pg_log_entry_t
&entry
,
464 ObjectStore::Transaction
*t
);
467 const pg_log_entry_t
&entry
,
468 ObjectStore::Transaction
*t
);
471 const hobject_t
&hoid
,
472 ObjectStore::Transaction
*t
);
476 void handle_recovery_delete(OpRequestRef op
);
477 void handle_recovery_delete_reply(OpRequestRef op
);
479 /// Reapply old attributes
480 void rollback_setattrs(
481 const hobject_t
&hoid
,
482 map
<string
, boost::optional
<bufferlist
> > &old_attrs
,
483 ObjectStore::Transaction
*t
);
485 /// Truncate object to rollback append
486 virtual void rollback_append(
487 const hobject_t
&hoid
,
489 ObjectStore::Transaction
*t
);
491 /// Unstash object to rollback stash
493 const hobject_t
&hoid
,
494 version_t old_version
,
495 ObjectStore::Transaction
*t
);
497 /// Unstash object to rollback stash
498 void rollback_try_stash(
499 const hobject_t
&hoid
,
500 version_t old_version
,
501 ObjectStore::Transaction
*t
);
503 /// Delete object to rollback create
504 void rollback_create(
505 const hobject_t
&hoid
,
506 ObjectStore::Transaction
*t
) {
510 /// Clone the extents back into place
511 void rollback_extents(
513 const vector
<pair
<uint64_t, uint64_t> > &extents
,
514 const hobject_t
&hoid
,
515 ObjectStore::Transaction
*t
);
518 /// Trim object stashed at version
519 void trim_rollback_object(
520 const hobject_t
&hoid
,
522 ObjectStore::Transaction
*t
);
524 /// List objects in collection
525 int objects_list_partial(
526 const hobject_t
&begin
,
529 vector
<hobject_t
> *ls
,
532 int objects_list_range(
533 const hobject_t
&start
,
534 const hobject_t
&end
,
536 vector
<hobject_t
> *ls
,
537 vector
<ghobject_t
> *gen_obs
=0);
539 int objects_get_attr(
540 const hobject_t
&hoid
,
544 virtual int objects_get_attrs(
545 const hobject_t
&hoid
,
546 map
<string
, bufferlist
> *out
);
548 virtual int objects_read_sync(
549 const hobject_t
&hoid
,
555 virtual void objects_read_async(
556 const hobject_t
&hoid
,
557 const list
<pair
<boost::tuple
<uint64_t, uint64_t, uint32_t>,
558 pair
<bufferlist
*, Context
*> > > &to_read
,
559 Context
*on_complete
, bool fast_read
= false) = 0;
561 virtual bool scrub_supported() = 0;
562 virtual bool auto_repair_supported() const = 0;
565 ScrubMapBuilder
&pos
);
566 bool be_compare_scrub_objects(
567 pg_shard_t auth_shard
,
568 const ScrubMap::object
&auth
,
569 const object_info_t
& auth_oi
,
570 const ScrubMap::object
&candidate
,
571 shard_info_wrapper
& shard_error
,
572 inconsistent_obj_wrapper
&result
,
573 ostream
&errorstream
);
574 map
<pg_shard_t
, ScrubMap
*>::const_iterator
be_select_auth_object(
575 const hobject_t
&obj
,
576 const map
<pg_shard_t
,ScrubMap
*> &maps
,
577 object_info_t
*auth_oi
,
578 map
<pg_shard_t
, shard_info_wrapper
> &shard_map
,
579 inconsistent_obj_wrapper
&object_error
);
580 void be_compare_scrubmaps(
581 const map
<pg_shard_t
,ScrubMap
*> &maps
,
582 const set
<hobject_t
> &master_set
,
584 map
<hobject_t
, set
<pg_shard_t
>> &missing
,
585 map
<hobject_t
, set
<pg_shard_t
>> &inconsistent
,
586 map
<hobject_t
, list
<pg_shard_t
>> &authoritative
,
587 map
<hobject_t
, pair
<boost::optional
<uint32_t>,
588 boost::optional
<uint32_t>>> &missing_digest
,
589 int &shallow_errors
, int &deep_errors
,
592 const vector
<int> &acting
,
593 ostream
&errorstream
);
594 virtual uint64_t be_get_ondisk_size(
595 uint64_t logical_size
) = 0;
596 virtual int be_deep_scrub(
597 const hobject_t
&oid
,
599 ScrubMapBuilder
&pos
,
600 ScrubMap::object
&o
) = 0;
601 void be_large_omap_check(
602 const map
<pg_shard_t
,ScrubMap
*> &maps
,
603 const set
<hobject_t
> &master_set
,
604 int& large_omap_objects
,
605 ostream
&warnstream
) const;
607 static PGBackend
*build_pg_backend(
608 const pg_pool_t
&pool
,
609 const OSDMapRef curmap
,
612 ObjectStore::CollectionHandle
&ch
,