1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013 Inktank Storage, Inc.
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/intrusive/set.hpp>
19 #include <boost/intrusive/list.hpp>
22 #include "PGBackend.h"
23 #include "erasure-code/ErasureCodeInterface.h"
25 #include "ECTransaction.h"
26 #include "ExtentCache.h"
30 struct ECSubWriteReply
;
32 struct ECSubReadReply
;
34 struct RecoveryMessages
;
35 class ECBackend
: public PGBackend
{
37 RecoveryHandle
*open_recovery_op() override
;
45 const hobject_t
&hoid
,
47 ObjectContextRef head
,
55 bool can_handle_while_inactive(
58 friend struct SubWriteApplied
;
59 friend struct SubWriteCommitted
;
60 void sub_write_applied(
63 const ZTracer::Trace
&trace
);
64 void sub_write_committed(
67 eversion_t last_complete
,
68 const ZTracer::Trace
&trace
);
69 void handle_sub_write(
73 const ZTracer::Trace
&trace
,
74 Context
*on_local_applied_sync
= 0
79 ECSubReadReply
*reply
,
80 const ZTracer::Trace
&trace
82 void handle_sub_write_reply(
84 const ECSubWriteReply
&op
,
85 const ZTracer::Trace
&trace
87 void handle_sub_read_reply(
91 const ZTracer::Trace
&trace
95 void check_recovery_sources(const OSDMapRef
& osdmap
) override
;
97 void on_change() override
;
98 void clear_recovery_state() override
;
100 void on_flushed() override
;
102 void dump_recovery_info(Formatter
*f
) const override
;
104 void call_write_ordered(std::function
<void(void)> &&cb
) override
;
106 void submit_transaction(
107 const hobject_t
&hoid
,
108 const object_stat_sum_t
&delta_stats
,
109 const eversion_t
&at_version
,
110 PGTransactionUPtr
&&t
,
111 const eversion_t
&trim_to
,
112 const eversion_t
&roll_forward_to
,
113 const vector
<pg_log_entry_t
> &log_entries
,
114 boost::optional
<pg_hit_set_history_t
> &hset_history
,
115 Context
*on_local_applied_sync
,
116 Context
*on_all_applied
,
117 Context
*on_all_commit
,
123 int objects_read_sync(
124 const hobject_t
&hoid
,
128 bufferlist
*bl
) override
;
131 * Async read mechanism
133 * Async reads use the same async read mechanism as does recovery.
134 * CallClientContexts is responsible for reconstructing the response
135 * buffer as well as for calling the callbacks.
137 * One tricky bit is that two reads may possibly not read from the same
138 * set of replicas. This could result in two reads completing in the
139 * wrong (from the interface user's point of view) order. Thus, we
140 * maintain a queue of in progress reads (@see in_progress_client_reads)
141 * to ensure that we always call the completion callback in order.
143 * Another subtely is that while we may read a degraded object, we will
144 * still only perform a client read from shards in the acting set. This
145 * ensures that we won't ever have to restart a client initiated read in
146 * check_recovery_sources.
148 void objects_read_and_reconstruct(
149 const map
<hobject_t
, std::list
<boost::tuple
<uint64_t, uint64_t, uint32_t> >
152 GenContextURef
<map
<hobject_t
,pair
<int, extent_map
> > &&> &&func
);
154 friend struct CallClientContexts
;
155 struct ClientAsyncReadStatus
{
156 unsigned objects_to_read
;
157 GenContextURef
<map
<hobject_t
,pair
<int, extent_map
> > &&> func
;
158 map
<hobject_t
,pair
<int, extent_map
> > results
;
159 explicit ClientAsyncReadStatus(
160 unsigned objects_to_read
,
161 GenContextURef
<map
<hobject_t
,pair
<int, extent_map
> > &&> &&func
)
162 : objects_to_read(objects_to_read
), func(std::move(func
)) {}
163 void complete_object(
164 const hobject_t
&hoid
,
166 extent_map
&&buffers
) {
167 assert(objects_to_read
);
169 assert(!results
.count(hoid
));
170 results
.emplace(hoid
, make_pair(err
, std::move(buffers
)));
172 bool is_complete() const {
173 return objects_to_read
== 0;
176 func
.release()->complete(std::move(results
));
179 list
<ClientAsyncReadStatus
> in_progress_client_reads
;
180 void objects_read_async(
181 const hobject_t
&hoid
,
182 const list
<pair
<boost::tuple
<uint64_t, uint64_t, uint32_t>,
183 pair
<bufferlist
*, Context
*> > > &to_read
,
184 Context
*on_complete
,
185 bool fast_read
= false) override
;
187 template <typename Func
>
188 void objects_read_async_no_cache(
189 const map
<hobject_t
,extent_set
> &to_read
,
190 Func
&&on_complete
) {
191 map
<hobject_t
,std::list
<boost::tuple
<uint64_t, uint64_t, uint32_t> > > _to_read
;
192 for (auto &&hpair
: to_read
) {
193 auto &l
= _to_read
[hpair
.first
];
194 for (auto extent
: hpair
.second
) {
195 l
.emplace_back(extent
.first
, extent
.second
, 0);
198 objects_read_and_reconstruct(
201 make_gen_lambda_context
<
202 map
<hobject_t
,pair
<int, extent_map
> > &&, Func
>(
203 std::forward
<Func
>(on_complete
)));
206 while (in_progress_client_reads
.size() &&
207 in_progress_client_reads
.front().is_complete()) {
208 in_progress_client_reads
.front().run();
209 in_progress_client_reads
.pop_front();
214 friend struct ECRecoveryHandle
;
215 uint64_t get_recovery_chunk_size() const {
216 return ROUND_UP_TO(cct
->_conf
->osd_recovery_max_chunk
,
217 sinfo
.get_stripe_width());
220 void get_want_to_read_shards(set
<int> *want_to_read
) const {
221 const vector
<int> &chunk_mapping
= ec_impl
->get_chunk_mapping();
222 for (int i
= 0; i
< (int)ec_impl
->get_data_chunk_count(); ++i
) {
223 int chunk
= (int)chunk_mapping
.size() > i
? chunk_mapping
[i
] : i
;
224 want_to_read
->insert(chunk
);
231 * Recovery uses the same underlying read mechanism as client reads
232 * with the slight difference that recovery reads may come from non
233 * acting shards. Thus, check_recovery_sources may wind up calling
234 * cancel_pull for a read originating with RecoveryOp.
236 * The recovery process is expressed as a state machine:
237 * - IDLE: Nothing is currently in progress, reads will be started and
238 * we will transition to READING
239 * - READING: We are awaiting a pending read op. Once complete, we will
240 * decode the buffers and proceed to WRITING
241 * - WRITING: We are awaiting a completed push. Once complete, we will
242 * either transition to COMPLETE or to IDLE to continue.
243 * - COMPLETE: complete
245 * We use the existing Push and PushReply messages and structures to
246 * handle actually shuffling the data over to the replicas. recovery_info
247 * and recovery_progress are expressed in terms of the logical offset
248 * space except for data_included which is in terms of the chunked object
249 * space (to match the passed buffer).
251 * xattrs are requested on the first read and used to initialize the
252 * object_context if missing on completion of the first read.
254 * In order to batch up reads and writes, we batch Push, PushReply,
255 * Transaction, and reads in a RecoveryMessages object which is passed
256 * among the recovery methods.
261 set
<pg_shard_t
> missing_on
;
262 set
<shard_id_t
> missing_on_shards
;
264 ObjectRecoveryInfo recovery_info
;
265 ObjectRecoveryProgress recovery_progress
;
267 enum state_t
{ IDLE
, READING
, WRITING
, COMPLETE
} state
;
269 static const char* tostr(state_t state
) {
271 case ECBackend::RecoveryOp::IDLE
:
274 case ECBackend::RecoveryOp::READING
:
277 case ECBackend::RecoveryOp::WRITING
:
280 case ECBackend::RecoveryOp::COMPLETE
:
289 // must be filled if state == WRITING
290 map
<int, bufferlist
> returned_data
;
291 map
<string
, bufferlist
> xattrs
;
292 ECUtil::HashInfoRef hinfo
;
293 ObjectContextRef obc
;
294 set
<pg_shard_t
> waiting_on_pushes
;
296 // valid in state READING
297 pair
<uint64_t, uint64_t> extent_requested
;
299 void dump(Formatter
*f
) const;
301 RecoveryOp() : state(IDLE
) {}
303 friend ostream
&operator<<(ostream
&lhs
, const RecoveryOp
&rhs
);
304 map
<hobject_t
, RecoveryOp
> recovery_ops
;
306 void continue_recovery_op(
308 RecoveryMessages
*m
);
309 void dispatch_recovery_messages(RecoveryMessages
&m
, int priority
);
310 friend struct OnRecoveryReadComplete
;
311 void handle_recovery_read_complete(
312 const hobject_t
&hoid
,
313 boost::tuple
<uint64_t, uint64_t, map
<pg_shard_t
, bufferlist
> > &to_read
,
314 boost::optional
<map
<string
, bufferlist
> > attrs
,
315 RecoveryMessages
*m
);
316 void handle_recovery_push(
318 RecoveryMessages
*m
);
319 void handle_recovery_push_reply(
320 const PushReplyOp
&op
,
322 RecoveryMessages
*m
);
326 * Low level async read mechanism
328 * To avoid duplicating the logic for requesting and waiting for
329 * multiple object shards, there is a common async read mechanism
330 * taking a map of hobject_t->read_request_t which defines callbacks
331 * taking read_result_ts as arguments.
333 * tid_to_read_map gives open read ops. check_recovery_sources uses
334 * shard_to_read_map and ReadOp::source_to_obj to restart reads
335 * involving down osds.
337 * The user is responsible for specifying replicas on which to read
338 * and for reassembling the buffer on the other side since client
339 * reads require the original object buffer while recovery only needs
340 * the missing pieces.
342 * Rather than handling reads on the primary directly, we simply send
343 * ourselves a message. This avoids a dedicated primary path for that
346 struct read_result_t
{
348 map
<pg_shard_t
, int> errors
;
349 boost::optional
<map
<string
, bufferlist
> > attrs
;
352 uint64_t, uint64_t, map
<pg_shard_t
, bufferlist
> > > returned
;
353 read_result_t() : r(0) {}
355 struct read_request_t
{
356 const list
<boost::tuple
<uint64_t, uint64_t, uint32_t> > to_read
;
357 const set
<pg_shard_t
> need
;
358 const bool want_attrs
;
359 GenContext
<pair
<RecoveryMessages
*, read_result_t
& > &> *cb
;
361 const list
<boost::tuple
<uint64_t, uint64_t, uint32_t> > &to_read
,
362 const set
<pg_shard_t
> &need
,
364 GenContext
<pair
<RecoveryMessages
*, read_result_t
& > &> *cb
)
365 : to_read(to_read
), need(need
), want_attrs(want_attrs
),
368 friend ostream
&operator<<(ostream
&lhs
, const read_request_t
&rhs
);
373 OpRequestRef op
; // may be null if not on behalf of a client
374 // True if redundant reads are issued, false otherwise,
375 // this is useful to tradeoff some resources (redundant ops) for
376 // low latency read, especially on relatively idle cluster
377 bool do_redundant_reads
;
378 // True if reading for recovery which could possibly reading only a subset
379 // of the available shards.
382 ZTracer::Trace trace
;
384 map
<hobject_t
, read_request_t
> to_read
;
385 map
<hobject_t
, read_result_t
> complete
;
387 map
<hobject_t
, set
<pg_shard_t
>> obj_to_source
;
388 map
<pg_shard_t
, set
<hobject_t
> > source_to_obj
;
390 void dump(Formatter
*f
) const;
392 set
<pg_shard_t
> in_progress
;
397 bool do_redundant_reads
,
400 map
<hobject_t
, read_request_t
> &&_to_read
)
401 : priority(priority
), tid(tid
), op(op
), do_redundant_reads(do_redundant_reads
),
402 for_recovery(for_recovery
), to_read(std::move(_to_read
)) {
403 for (auto &&hpair
: to_read
) {
404 auto &returned
= complete
[hpair
.first
].returned
;
405 for (auto &&extent
: hpair
.second
.to_read
) {
410 map
<pg_shard_t
, bufferlist
>()));
415 ReadOp(const ReadOp
&) = default;
416 ReadOp(ReadOp
&&) = default;
418 friend struct FinishReadOp
;
420 const OSDMapRef
& osdmap
,
422 void complete_read_op(ReadOp
&rop
, RecoveryMessages
*m
);
423 friend ostream
&operator<<(ostream
&lhs
, const ReadOp
&rhs
);
424 map
<ceph_tid_t
, ReadOp
> tid_to_read_map
;
425 map
<pg_shard_t
, set
<ceph_tid_t
> > shard_to_read_map
;
428 map
<hobject_t
, read_request_t
> &to_read
,
430 bool do_redundant_reads
, bool for_recovery
);
432 void do_read_op(ReadOp
&rop
);
433 int send_all_remaining_reads(
434 const hobject_t
&hoid
,
441 * ECTransaction is responsible for generating a transaction for
442 * each shard to which we need to send the write. As required
443 * by the PGBackend interface, the ECBackend write mechanism
444 * passes trim information with the write and last_complete back
447 * As with client reads, there is a possibility of out-of-order
448 * completions. Thus, callbacks and completion are called in order
449 * on the writing list.
451 struct Op
: boost::intrusive::list_base_hook
<> {
452 /// From submit_transaction caller, decribes operation
454 object_stat_sum_t delta_stats
;
457 boost::optional
<pg_hit_set_history_t
> updated_hit_set_history
;
458 vector
<pg_log_entry_t
> log_entries
;
461 ZTracer::Trace trace
;
463 eversion_t roll_forward_to
; /// Soon to be generated internally
465 /// Ancillary also provided from submit_transaction caller
466 map
<hobject_t
, ObjectContextRef
> obc_map
;
468 /// see call_write_ordered
469 std::list
<std::function
<void(void)> > on_write
;
471 /// Generated internally
472 set
<hobject_t
> temp_added
;
473 set
<hobject_t
> temp_cleared
;
475 ECTransaction::WritePlan plan
;
476 bool requires_rmw() const { return !plan
.to_read
.empty(); }
477 bool invalidates_cache() const { return plan
.invalidates_cache
; }
479 // must be true if requires_rmw(), must be false if invalidates_cache()
480 bool using_cache
= false;
482 /// In progress read state;
483 map
<hobject_t
,extent_set
> pending_read
; // subset already being read
484 map
<hobject_t
,extent_set
> remote_read
; // subset we must read
485 map
<hobject_t
,extent_map
> remote_read_result
;
486 bool read_in_progress() const {
487 return !remote_read
.empty() && remote_read_result
.empty();
490 /// In progress write state
491 set
<pg_shard_t
> pending_commit
;
492 set
<pg_shard_t
> pending_apply
;
493 bool write_in_progress() const {
494 return !pending_commit
.empty() || !pending_apply
.empty();
497 /// optional, may be null, for tracking purposes
498 OpRequestRef client_op
;
501 ExtentCache::write_pin pin
;
504 Context
*on_local_applied_sync
= nullptr;
505 Context
*on_all_applied
= nullptr;
506 Context
*on_all_commit
= nullptr;
508 delete on_local_applied_sync
;
509 delete on_all_applied
;
510 delete on_all_commit
;
513 using op_list
= boost::intrusive::list
<Op
>;
514 friend ostream
&operator<<(ostream
&lhs
, const Op
&rhs
);
517 map
<ceph_tid_t
, Op
> tid_to_op_map
; /// Owns Op structure
520 * We model the possible rmw states as a set of waitlists.
521 * All writes at this time complete in order, so a write blocked
522 * at waiting_state blocks all writes behind it as well (same for
525 * Future work: We can break this up into a per-object pipeline
526 * (almost). First, provide an ordering token to submit_transaction
527 * and require that all operations within a single transaction take
528 * place on a subset of hobject_t space partitioned by that token
529 * (the hashid seem about right to me -- even works for temp objects
530 * if you recall that a temp object created for object head foo will
531 * only ever be referenced by other transactions on foo and aren't
532 * reused). Next, factor this part into a class and maintain one per
533 * ordering token. Next, fixup PrimaryLogPG's repop queue to be
534 * partitioned by ordering token. Finally, refactor the op pipeline
535 * so that the log entries passed into submit_tranaction aren't
536 * versioned. We can't assign versions to them until we actually
537 * submit the operation. That's probably going to be the hard part.
539 class pipeline_state_t
{
543 } pipeline_state
= CACHE_VALID
;
545 bool caching_enabled() const {
546 return pipeline_state
== CACHE_VALID
;
548 bool cache_invalid() const {
549 return !caching_enabled();
552 pipeline_state
= CACHE_INVALID
;
555 pipeline_state
= CACHE_VALID
;
557 friend ostream
&operator<<(ostream
&lhs
, const pipeline_state_t
&rhs
);
561 op_list waiting_state
; /// writes waiting on pipe_state
562 op_list waiting_reads
; /// writes waiting on partial stripe reads
563 op_list waiting_commit
; /// writes waiting on initial commit
564 eversion_t completed_to
;
565 eversion_t committed_to
;
566 void start_rmw(Op
*op
, PGTransactionUPtr
&&t
);
567 bool try_state_to_reads();
568 bool try_reads_to_commit();
569 bool try_finish_rmw();
572 ErasureCodeInterfaceRef ec_impl
;
578 * Determines the whether _have is suffient to recover an object
580 class ECRecPred
: public IsPGRecoverablePredicate
{
582 ErasureCodeInterfaceRef ec_impl
;
584 explicit ECRecPred(ErasureCodeInterfaceRef ec_impl
) : ec_impl(ec_impl
) {
585 for (unsigned i
= 0; i
< ec_impl
->get_chunk_count(); ++i
) {
589 bool operator()(const set
<pg_shard_t
> &_have
) const override
{
591 for (set
<pg_shard_t
>::const_iterator i
= _have
.begin();
594 have
.insert(i
->shard
);
597 return ec_impl
->minimum_to_decode(want
, have
, &min
) == 0;
600 IsPGRecoverablePredicate
*get_is_recoverable_predicate() override
{
601 return new ECRecPred(ec_impl
);
607 * Determines the whether _have is suffient to read an object
609 class ECReadPred
: public IsPGReadablePredicate
{
615 ErasureCodeInterfaceRef ec_impl
) : whoami(whoami
), rec_pred(ec_impl
) {}
616 bool operator()(const set
<pg_shard_t
> &_have
) const override
{
617 return _have
.count(whoami
) && rec_pred(_have
);
620 IsPGReadablePredicate
*get_is_readable_predicate() override
{
621 return new ECReadPred(get_parent()->whoami_shard(), ec_impl
);
625 const ECUtil::stripe_info_t sinfo
;
626 /// If modified, ensure that the ref is held until the update is applied
627 SharedPtrRegistry
<hobject_t
, ECUtil::HashInfo
> unstable_hashinfo_registry
;
628 ECUtil::HashInfoRef
get_hash_info(const hobject_t
&hoid
, bool checks
= true,
629 const map
<string
,bufferptr
> *attr
= NULL
);
633 PGBackend::Listener
*pg
,
635 ObjectStore::CollectionHandle
&ch
,
638 ErasureCodeInterfaceRef ec_impl
,
639 uint64_t stripe_width
);
641 /// Returns to_read replicas sufficient to reconstruct want
642 int get_min_avail_to_read_shards(
643 const hobject_t
&hoid
, ///< [in] object
644 const set
<int> &want
, ///< [in] desired shards
645 bool for_recovery
, ///< [in] true if we may use non-acting replicas
646 bool do_redundant_reads
, ///< [in] true if we want to issue redundant reads to reduce latency
647 set
<pg_shard_t
> *to_read
///< [out] shards to read
648 ); ///< @return error code, 0 on success
650 int get_remaining_shards(
651 const hobject_t
&hoid
,
652 const set
<int> &avail
,
653 set
<pg_shard_t
> *to_read
);
655 int objects_get_attrs(
656 const hobject_t
&hoid
,
657 map
<string
, bufferlist
> *out
) override
;
659 void rollback_append(
660 const hobject_t
&hoid
,
662 ObjectStore::Transaction
*t
) override
;
664 bool scrub_supported() override
{ return true; }
665 bool auto_repair_supported() const override
{ return true; }
668 const hobject_t
&obj
,
671 ThreadPool::TPHandle
&handle
) override
;
672 uint64_t be_get_ondisk_size(uint64_t logical_size
) override
{
673 return sinfo
.logical_to_next_chunk_offset(logical_size
);
675 void _failed_push(const hobject_t
&hoid
,
676 pair
<RecoveryMessages
*, ECBackend::read_result_t
&> &in
);
678 ostream
&operator<<(ostream
&lhs
, const ECBackend::pipeline_state_t
&rhs
);