1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013 Inktank Storage, Inc.
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/intrusive/set.hpp>
19 #include <boost/intrusive/list.hpp>
22 #include "PGBackend.h"
23 #include "erasure-code/ErasureCodeInterface.h"
25 #include "ECTransaction.h"
26 #include "ExtentCache.h"
30 struct ECSubWriteReply
;
32 struct ECSubReadReply
;
34 struct RecoveryMessages
;
35 class ECBackend
: public PGBackend
{
37 RecoveryHandle
*open_recovery_op() override
;
45 const hobject_t
&hoid
,
47 ObjectContextRef head
,
55 bool can_handle_while_inactive(
58 friend struct SubWriteApplied
;
59 friend struct SubWriteCommitted
;
60 void sub_write_applied(
63 const ZTracer::Trace
&trace
);
64 void sub_write_committed(
67 eversion_t last_complete
,
68 const ZTracer::Trace
&trace
);
69 void handle_sub_write(
73 const ZTracer::Trace
&trace
,
74 Context
*on_local_applied_sync
= 0
79 ECSubReadReply
*reply
,
80 const ZTracer::Trace
&trace
82 void handle_sub_write_reply(
84 const ECSubWriteReply
&op
,
85 const ZTracer::Trace
&trace
87 void handle_sub_read_reply(
91 const ZTracer::Trace
&trace
95 void check_recovery_sources(const OSDMapRef
& osdmap
) override
;
97 void on_change() override
;
98 void clear_recovery_state() override
;
100 void on_flushed() override
;
102 void dump_recovery_info(Formatter
*f
) const override
;
104 void call_write_ordered(std::function
<void(void)> &&cb
) override
;
106 void submit_transaction(
107 const hobject_t
&hoid
,
108 const object_stat_sum_t
&delta_stats
,
109 const eversion_t
&at_version
,
110 PGTransactionUPtr
&&t
,
111 const eversion_t
&trim_to
,
112 const eversion_t
&roll_forward_to
,
113 const vector
<pg_log_entry_t
> &log_entries
,
114 boost::optional
<pg_hit_set_history_t
> &hset_history
,
115 Context
*on_local_applied_sync
,
116 Context
*on_all_applied
,
117 Context
*on_all_commit
,
123 int objects_read_sync(
124 const hobject_t
&hoid
,
128 bufferlist
*bl
) override
;
131 * Async read mechanism
133 * Async reads use the same async read mechanism as does recovery.
134 * CallClientContexts is responsible for reconstructing the response
135 * buffer as well as for calling the callbacks.
137 * One tricky bit is that two reads may possibly not read from the same
138 * set of replicas. This could result in two reads completing in the
139 * wrong (from the interface user's point of view) order. Thus, we
140 * maintain a queue of in progress reads (@see in_progress_client_reads)
141 * to ensure that we always call the completion callback in order.
143 * Another subtlety is that while we may read a degraded object, we will
144 * still only perform a client read from shards in the acting set. This
145 * ensures that we won't ever have to restart a client initiated read in
146 * check_recovery_sources.
148 void objects_read_and_reconstruct(
149 const map
<hobject_t
, std::list
<boost::tuple
<uint64_t, uint64_t, uint32_t> >
152 GenContextURef
<map
<hobject_t
,pair
<int, extent_map
> > &&> &&func
);
154 friend struct CallClientContexts
;
155 struct ClientAsyncReadStatus
{
156 unsigned objects_to_read
;
157 GenContextURef
<map
<hobject_t
,pair
<int, extent_map
> > &&> func
;
158 map
<hobject_t
,pair
<int, extent_map
> > results
;
159 explicit ClientAsyncReadStatus(
160 unsigned objects_to_read
,
161 GenContextURef
<map
<hobject_t
,pair
<int, extent_map
> > &&> &&func
)
162 : objects_to_read(objects_to_read
), func(std::move(func
)) {}
163 void complete_object(
164 const hobject_t
&hoid
,
166 extent_map
&&buffers
) {
167 assert(objects_to_read
);
169 assert(!results
.count(hoid
));
170 results
.emplace(hoid
, make_pair(err
, std::move(buffers
)));
172 bool is_complete() const {
173 return objects_to_read
== 0;
176 func
.release()->complete(std::move(results
));
179 list
<ClientAsyncReadStatus
> in_progress_client_reads
;
180 void objects_read_async(
181 const hobject_t
&hoid
,
182 const list
<pair
<boost::tuple
<uint64_t, uint64_t, uint32_t>,
183 pair
<bufferlist
*, Context
*> > > &to_read
,
184 Context
*on_complete
,
185 bool fast_read
= false) override
;
187 template <typename Func
>
188 void objects_read_async_no_cache(
189 const map
<hobject_t
,extent_set
> &to_read
,
190 Func
&&on_complete
) {
191 map
<hobject_t
,std::list
<boost::tuple
<uint64_t, uint64_t, uint32_t> > > _to_read
;
192 for (auto &&hpair
: to_read
) {
193 auto &l
= _to_read
[hpair
.first
];
194 for (auto extent
: hpair
.second
) {
195 l
.emplace_back(extent
.first
, extent
.second
, 0);
198 objects_read_and_reconstruct(
201 make_gen_lambda_context
<
202 map
<hobject_t
,pair
<int, extent_map
> > &&, Func
>(
203 std::forward
<Func
>(on_complete
)));
206 while (in_progress_client_reads
.size() &&
207 in_progress_client_reads
.front().is_complete()) {
208 in_progress_client_reads
.front().run();
209 in_progress_client_reads
.pop_front();
214 friend struct ECRecoveryHandle
;
215 uint64_t get_recovery_chunk_size() const {
216 return ROUND_UP_TO(cct
->_conf
->osd_recovery_max_chunk
,
217 sinfo
.get_stripe_width());
220 void get_want_to_read_shards(set
<int> *want_to_read
) const {
221 const vector
<int> &chunk_mapping
= ec_impl
->get_chunk_mapping();
222 for (int i
= 0; i
< (int)ec_impl
->get_data_chunk_count(); ++i
) {
223 int chunk
= (int)chunk_mapping
.size() > i
? chunk_mapping
[i
] : i
;
224 want_to_read
->insert(chunk
);
231 * Recovery uses the same underlying read mechanism as client reads
232 * with the slight difference that recovery reads may come from non
233 * acting shards. Thus, check_recovery_sources may wind up calling
234 * cancel_pull for a read originating with RecoveryOp.
236 * The recovery process is expressed as a state machine:
237 * - IDLE: Nothing is currently in progress, reads will be started and
238 * we will transition to READING
239 * - READING: We are awaiting a pending read op. Once complete, we will
240 * decode the buffers and proceed to WRITING
241 * - WRITING: We are awaiting a completed push. Once complete, we will
242 * either transition to COMPLETE or to IDLE to continue.
243 * - COMPLETE: complete
245 * We use the existing Push and PushReply messages and structures to
246 * handle actually shuffling the data over to the replicas. recovery_info
247 * and recovery_progress are expressed in terms of the logical offset
248 * space except for data_included which is in terms of the chunked object
249 * space (to match the passed buffer).
251 * xattrs are requested on the first read and used to initialize the
252 * object_context if missing on completion of the first read.
254 * In order to batch up reads and writes, we batch Push, PushReply,
255 * Transaction, and reads in a RecoveryMessages object which is passed
256 * among the recovery methods.
261 set
<pg_shard_t
> missing_on
;
262 set
<shard_id_t
> missing_on_shards
;
264 ObjectRecoveryInfo recovery_info
;
265 ObjectRecoveryProgress recovery_progress
;
267 enum state_t
{ IDLE
, READING
, WRITING
, COMPLETE
} state
;
269 static const char* tostr(state_t state
) {
271 case ECBackend::RecoveryOp::IDLE
:
274 case ECBackend::RecoveryOp::READING
:
277 case ECBackend::RecoveryOp::WRITING
:
280 case ECBackend::RecoveryOp::COMPLETE
:
289 // must be filled if state == WRITING
290 map
<int, bufferlist
> returned_data
;
291 map
<string
, bufferlist
> xattrs
;
292 ECUtil::HashInfoRef hinfo
;
293 ObjectContextRef obc
;
294 set
<pg_shard_t
> waiting_on_pushes
;
296 // valid in state READING
297 pair
<uint64_t, uint64_t> extent_requested
;
299 void dump(Formatter
*f
) const;
301 RecoveryOp() : state(IDLE
) {}
303 friend ostream
&operator<<(ostream
&lhs
, const RecoveryOp
&rhs
);
304 map
<hobject_t
, RecoveryOp
> recovery_ops
;
306 void continue_recovery_op(
308 RecoveryMessages
*m
);
309 void dispatch_recovery_messages(RecoveryMessages
&m
, int priority
);
310 friend struct OnRecoveryReadComplete
;
311 void handle_recovery_read_complete(
312 const hobject_t
&hoid
,
313 boost::tuple
<uint64_t, uint64_t, map
<pg_shard_t
, bufferlist
> > &to_read
,
314 boost::optional
<map
<string
, bufferlist
> > attrs
,
315 RecoveryMessages
*m
);
316 void handle_recovery_push(
318 RecoveryMessages
*m
);
319 void handle_recovery_push_reply(
320 const PushReplyOp
&op
,
322 RecoveryMessages
*m
);
323 void get_all_avail_shards(
324 const hobject_t
&hoid
,
326 map
<shard_id_t
, pg_shard_t
> &shards
,
331 * Low level async read mechanism
333 * To avoid duplicating the logic for requesting and waiting for
334 * multiple object shards, there is a common async read mechanism
335 * taking a map of hobject_t->read_request_t which defines callbacks
336 * taking read_result_ts as arguments.
338 * tid_to_read_map gives open read ops. check_recovery_sources uses
339 * shard_to_read_map and ReadOp::source_to_obj to restart reads
340 * involving down osds.
342 * The user is responsible for specifying replicas on which to read
343 * and for reassembling the buffer on the other side since client
344 * reads require the original object buffer while recovery only needs
345 * the missing pieces.
347 * Rather than handling reads on the primary directly, we simply send
348 * ourselves a message. This avoids a dedicated primary path for that
351 struct read_result_t
{
353 map
<pg_shard_t
, int> errors
;
354 boost::optional
<map
<string
, bufferlist
> > attrs
;
357 uint64_t, uint64_t, map
<pg_shard_t
, bufferlist
> > > returned
;
358 read_result_t() : r(0) {}
360 struct read_request_t
{
361 const list
<boost::tuple
<uint64_t, uint64_t, uint32_t> > to_read
;
362 const set
<pg_shard_t
> need
;
363 const bool want_attrs
;
364 GenContext
<pair
<RecoveryMessages
*, read_result_t
& > &> *cb
;
366 const list
<boost::tuple
<uint64_t, uint64_t, uint32_t> > &to_read
,
367 const set
<pg_shard_t
> &need
,
369 GenContext
<pair
<RecoveryMessages
*, read_result_t
& > &> *cb
)
370 : to_read(to_read
), need(need
), want_attrs(want_attrs
),
373 friend ostream
&operator<<(ostream
&lhs
, const read_request_t
&rhs
);
378 OpRequestRef op
; // may be null if not on behalf of a client
379 // True if redundant reads are issued, false otherwise,
380 // this is useful to tradeoff some resources (redundant ops) for
381 // low latency read, especially on relatively idle cluster
382 bool do_redundant_reads
;
383 // True if reading for recovery which could possibly be reading only a subset
384 // of the available shards.
387 ZTracer::Trace trace
;
389 map
<hobject_t
, read_request_t
> to_read
;
390 map
<hobject_t
, read_result_t
> complete
;
392 map
<hobject_t
, set
<pg_shard_t
>> obj_to_source
;
393 map
<pg_shard_t
, set
<hobject_t
> > source_to_obj
;
395 void dump(Formatter
*f
) const;
397 set
<pg_shard_t
> in_progress
;
402 bool do_redundant_reads
,
405 map
<hobject_t
, read_request_t
> &&_to_read
)
406 : priority(priority
), tid(tid
), op(op
), do_redundant_reads(do_redundant_reads
),
407 for_recovery(for_recovery
), to_read(std::move(_to_read
)) {
408 for (auto &&hpair
: to_read
) {
409 auto &returned
= complete
[hpair
.first
].returned
;
410 for (auto &&extent
: hpair
.second
.to_read
) {
415 map
<pg_shard_t
, bufferlist
>()));
420 ReadOp(const ReadOp
&) = default;
421 ReadOp(ReadOp
&&) = default;
423 friend struct FinishReadOp
;
425 const OSDMapRef
& osdmap
,
427 void complete_read_op(ReadOp
&rop
, RecoveryMessages
*m
);
428 friend ostream
&operator<<(ostream
&lhs
, const ReadOp
&rhs
);
429 map
<ceph_tid_t
, ReadOp
> tid_to_read_map
;
430 map
<pg_shard_t
, set
<ceph_tid_t
> > shard_to_read_map
;
433 map
<hobject_t
, read_request_t
> &to_read
,
435 bool do_redundant_reads
, bool for_recovery
);
437 void do_read_op(ReadOp
&rop
);
438 int send_all_remaining_reads(
439 const hobject_t
&hoid
,
446 * ECTransaction is responsible for generating a transaction for
447 * each shard to which we need to send the write. As required
448 * by the PGBackend interface, the ECBackend write mechanism
449 * passes trim information with the write and last_complete back
452 * As with client reads, there is a possibility of out-of-order
453 * completions. Thus, callbacks and completion are called in order
454 * on the writing list.
456 struct Op
: boost::intrusive::list_base_hook
<> {
457 /// From submit_transaction caller, describes operation
459 object_stat_sum_t delta_stats
;
462 boost::optional
<pg_hit_set_history_t
> updated_hit_set_history
;
463 vector
<pg_log_entry_t
> log_entries
;
466 ZTracer::Trace trace
;
468 eversion_t roll_forward_to
; /// Soon to be generated internally
470 /// Ancillary also provided from submit_transaction caller
471 map
<hobject_t
, ObjectContextRef
> obc_map
;
473 /// see call_write_ordered
474 std::list
<std::function
<void(void)> > on_write
;
476 /// Generated internally
477 set
<hobject_t
> temp_added
;
478 set
<hobject_t
> temp_cleared
;
480 ECTransaction::WritePlan plan
;
/// Returns true when plan.to_read is non-empty, false otherwise.
481 bool requires_rmw() const { return !plan
.to_read
.empty(); }
/// Returns the invalidates_cache flag stored on plan, unchanged.
482 bool invalidates_cache() const { return plan
.invalidates_cache
; }
484 // must be true if requires_rmw(), must be false if invalidates_cache()
485 bool using_cache
= false;
487 /// In progress read state;
488 map
<hobject_t
,extent_set
> pending_read
; // subset already being read
489 map
<hobject_t
,extent_set
> remote_read
; // subset we must read
490 map
<hobject_t
,extent_map
> remote_read_result
;
491 bool read_in_progress() const {
492 return !remote_read
.empty() && remote_read_result
.empty();
495 /// In progress write state
496 set
<pg_shard_t
> pending_commit
;
497 set
<pg_shard_t
> pending_apply
;
498 bool write_in_progress() const {
499 return !pending_commit
.empty() || !pending_apply
.empty();
502 /// optional, may be null, for tracking purposes
503 OpRequestRef client_op
;
506 ExtentCache::write_pin pin
;
509 Context
*on_local_applied_sync
= nullptr;
510 Context
*on_all_applied
= nullptr;
511 Context
*on_all_commit
= nullptr;
513 delete on_local_applied_sync
;
514 delete on_all_applied
;
515 delete on_all_commit
;
518 using op_list
= boost::intrusive::list
<Op
>;
519 friend ostream
&operator<<(ostream
&lhs
, const Op
&rhs
);
522 map
<ceph_tid_t
, Op
> tid_to_op_map
; /// Owns Op structure
525 * We model the possible rmw states as a set of waitlists.
526 * All writes at this time complete in order, so a write blocked
527 * at waiting_state blocks all writes behind it as well (same for
530 * Future work: We can break this up into a per-object pipeline
531 * (almost). First, provide an ordering token to submit_transaction
532 * and require that all operations within a single transaction take
533 * place on a subset of hobject_t space partitioned by that token
534 * (the hashid seems about right to me -- even works for temp objects
535 * if you recall that a temp object created for object head foo will
536 * only ever be referenced by other transactions on foo and isn't
537 * reused). Next, factor this part into a class and maintain one per
538 * ordering token. Next, fixup PrimaryLogPG's repop queue to be
539 * partitioned by ordering token. Finally, refactor the op pipeline
540 * so that the log entries passed into submit_transaction aren't
541 * versioned. We can't assign versions to them until we actually
542 * submit the operation. That's probably going to be the hard part.
544 class pipeline_state_t
{
548 } pipeline_state
= CACHE_VALID
;
550 bool caching_enabled() const {
551 return pipeline_state
== CACHE_VALID
;
553 bool cache_invalid() const {
554 return !caching_enabled();
557 pipeline_state
= CACHE_INVALID
;
560 pipeline_state
= CACHE_VALID
;
562 friend ostream
&operator<<(ostream
&lhs
, const pipeline_state_t
&rhs
);
566 op_list waiting_state
; /// writes waiting on pipe_state
567 op_list waiting_reads
; /// writes waiting on partial stripe reads
568 op_list waiting_commit
; /// writes waiting on initial commit
569 eversion_t completed_to
;
570 eversion_t committed_to
;
571 void start_rmw(Op
*op
, PGTransactionUPtr
&&t
);
572 bool try_state_to_reads();
573 bool try_reads_to_commit();
574 bool try_finish_rmw();
577 ErasureCodeInterfaceRef ec_impl
;
583 * Determines whether _have is sufficient to recover an object
585 class ECRecPred
: public IsPGRecoverablePredicate
{
587 ErasureCodeInterfaceRef ec_impl
;
589 explicit ECRecPred(ErasureCodeInterfaceRef ec_impl
) : ec_impl(ec_impl
) {
590 for (unsigned i
= 0; i
< ec_impl
->get_chunk_count(); ++i
) {
594 bool operator()(const set
<pg_shard_t
> &_have
) const override
{
596 for (set
<pg_shard_t
>::const_iterator i
= _have
.begin();
599 have
.insert(i
->shard
);
602 return ec_impl
->minimum_to_decode(want
, have
, &min
) == 0;
605 IsPGRecoverablePredicate
*get_is_recoverable_predicate() override
{
606 return new ECRecPred(ec_impl
);
612 * Determines whether _have is sufficient to read an object
614 class ECReadPred
: public IsPGReadablePredicate
{
620 ErasureCodeInterfaceRef ec_impl
) : whoami(whoami
), rec_pred(ec_impl
) {}
621 bool operator()(const set
<pg_shard_t
> &_have
) const override
{
622 return _have
.count(whoami
) && rec_pred(_have
);
625 IsPGReadablePredicate
*get_is_readable_predicate() override
{
626 return new ECReadPred(get_parent()->whoami_shard(), ec_impl
);
630 const ECUtil::stripe_info_t sinfo
;
631 /// If modified, ensure that the ref is held until the update is applied
632 SharedPtrRegistry
<hobject_t
, ECUtil::HashInfo
> unstable_hashinfo_registry
;
633 ECUtil::HashInfoRef
get_hash_info(const hobject_t
&hoid
, bool checks
= true,
634 const map
<string
,bufferptr
> *attr
= NULL
);
638 PGBackend::Listener
*pg
,
640 ObjectStore::CollectionHandle
&ch
,
643 ErasureCodeInterfaceRef ec_impl
,
644 uint64_t stripe_width
);
646 /// Returns to_read replicas sufficient to reconstruct want
647 int get_min_avail_to_read_shards(
648 const hobject_t
&hoid
, ///< [in] object
649 const set
<int> &want
, ///< [in] desired shards
650 bool for_recovery
, ///< [in] true if we may use non-acting replicas
651 bool do_redundant_reads
, ///< [in] true if we want to issue redundant reads to reduce latency
652 set
<pg_shard_t
> *to_read
///< [out] shards to read
653 ); ///< @return error code, 0 on success
655 int get_remaining_shards(
656 const hobject_t
&hoid
,
657 const set
<int> &avail
,
658 set
<pg_shard_t
> *to_read
,
661 int objects_get_attrs(
662 const hobject_t
&hoid
,
663 map
<string
, bufferlist
> *out
) override
;
665 void rollback_append(
666 const hobject_t
&hoid
,
668 ObjectStore::Transaction
*t
) override
;
/// Override that unconditionally returns true.
670 bool scrub_supported() override
{ return true; }
/// Override that unconditionally returns true.
671 bool auto_repair_supported() const override
{ return true; }
674 const hobject_t
&obj
,
677 ThreadPool::TPHandle
&handle
) override
;
678 uint64_t be_get_ondisk_size(uint64_t logical_size
) override
{
679 return sinfo
.logical_to_next_chunk_offset(logical_size
);
681 void _failed_push(const hobject_t
&hoid
,
682 pair
<RecoveryMessages
*, ECBackend::read_result_t
&> &in
);
684 ostream
&operator<<(ostream
&lhs
, const ECBackend::pipeline_state_t
&rhs
);