// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013 Inktank Storage, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/intrusive/set.hpp>
#include <boost/intrusive/list.hpp>

#include "PGBackend.h"
#include "erasure-code/ErasureCodeInterface.h"
#include "ECTransaction.h"
#include "ExtentCache.h"
struct ECSubWriteReply;
struct ECSubReadReply;
struct RecoveryMessages;
class ECBackend : public PGBackend {
  RecoveryHandle *open_recovery_op() override;
    const hobject_t &hoid,
    ObjectContextRef head,
  bool can_handle_while_inactive(
  friend struct SubWriteApplied;
  friend struct SubWriteCommitted;
  void sub_write_applied(
    const ZTracer::Trace &trace);
  void sub_write_committed(
    eversion_t last_complete,
    const ZTracer::Trace &trace);
  void handle_sub_write(
    const ZTracer::Trace &trace,
    Context *on_local_applied_sync = 0
    );
  void handle_sub_read(
    ECSubReadReply *reply,
    const ZTracer::Trace &trace
    );
  void handle_sub_write_reply(
    const ECSubWriteReply &op,
    const ZTracer::Trace &trace
    );
  void handle_sub_read_reply(
    const ZTracer::Trace &trace
    );
  void check_recovery_sources(const OSDMapRef& osdmap) override;

  void on_change() override;
  void clear_recovery_state() override;

  void on_flushed() override;

  void dump_recovery_info(Formatter *f) const override;

  void call_write_ordered(std::function<void(void)> &&cb) override;
  void submit_transaction(
    const hobject_t &hoid,
    const object_stat_sum_t &delta_stats,
    const eversion_t &at_version,
    PGTransactionUPtr &&t,
    const eversion_t &trim_to,
    const eversion_t &roll_forward_to,
    const vector<pg_log_entry_t> &log_entries,
    boost::optional<pg_hit_set_history_t> &hset_history,
    Context *on_local_applied_sync,
    Context *on_all_applied,
    Context *on_all_commit,
  int objects_read_sync(
    const hobject_t &hoid,
    bufferlist *bl) override;
  /**
   * Async read mechanism
   *
   * Async reads use the same async read mechanism as does recovery.
   * CallClientContexts is responsible for reconstructing the response
   * buffer as well as for calling the callbacks.
   *
   * One tricky bit is that two reads may possibly not read from the same
   * set of replicas.  This could result in two reads completing in the
   * wrong (from the interface user's point of view) order.  Thus, we
   * maintain a queue of in progress reads (@see in_progress_client_reads)
   * to ensure that we always call the completion callback in order.
   *
   * Another subtlety is that while we may read a degraded object, we will
   * still only perform a client read from shards in the acting set.  This
   * ensures that we won't ever have to restart a client initiated read in
   * check_recovery_sources.
   */
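  /*
   * Illustrative sketch of the ordering guarantee described above; the names
   * (ec, oid_a, oid_b, c_a, c_b) are hypothetical and not part of this
   * interface:
   *
   *   // two async reads issued back to back against an ECBackend `ec`
   *   ec.objects_read_async(oid_a, to_read_a, c_a);
   *   ec.objects_read_async(oid_b, to_read_b, c_b);
   *   // even if oid_b's shard reads return first, c_a is completed before
   *   // c_b because completions are drained from in_progress_client_reads
   *   // in FIFO order.
   */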
  void objects_read_and_reconstruct(
    const map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> >
    GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func);
  friend struct CallClientContexts;
  struct ClientAsyncReadStatus {
    unsigned objects_to_read;
    GenContextURef<map<hobject_t,pair<int, extent_map> > &&> func;
    map<hobject_t,pair<int, extent_map> > results;
    explicit ClientAsyncReadStatus(
      unsigned objects_to_read,
      GenContextURef<map<hobject_t,pair<int, extent_map> > &&> &&func)
      : objects_to_read(objects_to_read), func(std::move(func)) {}
    void complete_object(
      const hobject_t &hoid,
      int err,
      extent_map &&buffers) {
      assert(objects_to_read);
      --objects_to_read;
      assert(!results.count(hoid));
      results.emplace(hoid, make_pair(err, std::move(buffers)));
    }
    bool is_complete() const {
      return objects_to_read == 0;
    }
    void run() {
      func.release()->complete(std::move(results));
    }
  };
  list<ClientAsyncReadStatus> in_progress_client_reads;
  void objects_read_async(
    const hobject_t &hoid,
    const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
                    pair<bufferlist*, Context*> > > &to_read,
    Context *on_complete,
    bool fast_read = false) override;
  template <typename Func>
  void objects_read_async_no_cache(
    const map<hobject_t,extent_set> &to_read,
    Func &&on_complete) {
    map<hobject_t,std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > _to_read;
    for (auto &&hpair: to_read) {
      auto &l = _to_read[hpair.first];
      for (auto extent: hpair.second) {
        l.emplace_back(extent.first, extent.second, 0);
      }
    }
    objects_read_and_reconstruct(
      _to_read,
      make_gen_lambda_context<
        map<hobject_t,pair<int, extent_map> > &&, Func>(
          std::forward<Func>(on_complete)));
  }
  void kick_reads() {
    while (in_progress_client_reads.size() &&
           in_progress_client_reads.front().is_complete()) {
      in_progress_client_reads.front().run();
      in_progress_client_reads.pop_front();
    }
  }
  friend struct ECRecoveryHandle;
  uint64_t get_recovery_chunk_size() const {
    return ROUND_UP_TO(cct->_conf->osd_recovery_max_chunk,
                       sinfo.get_stripe_width());
  }
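  // Worked example of the rounding above (hypothetical configuration): with
  // osd_recovery_max_chunk = 8 MiB (8388608 bytes) and a stripe width of
  // 24 KiB (24576 bytes, e.g. k=6 data chunks of 4 KiB each), 8388608 is not
  // a multiple of 24576 (24576 * 341 = 8380416), so the recovery chunk size
  // is rounded up to 24576 * 342 = 8404992 bytes, a whole number of stripes.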
  void get_want_to_read_shards(set<int> *want_to_read) const {
    const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
    for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
      int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
      want_to_read->insert(chunk);
    }
  }
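  // Example (hypothetical mapping): with get_data_chunk_count() == 4 and
  // get_chunk_mapping() == {1, 3, 0, 2, ...}, want_to_read becomes
  // {1, 3, 0, 2}; with an empty mapping the identity set {0, 1, 2, 3} is used.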
  /**
   * Recovery uses the same underlying read mechanism as client reads
   * with the slight difference that recovery reads may come from non
   * acting shards.  Thus, check_recovery_sources may wind up calling
   * cancel_pull for a read originating with RecoveryOp.
   *
   * The recovery process is expressed as a state machine:
   * - IDLE: Nothing is currently in progress, reads will be started and
   *         we will transition to READING
   * - READING: We are awaiting a pending read op.  Once complete, we will
   *            decode the buffers and proceed to WRITING
   * - WRITING: We are awaiting a completed push.  Once complete, we will
   *            either transition to COMPLETE or to IDLE to continue.
   * - COMPLETE: complete
   *
   * We use the existing Push and PushReply messages and structures to
   * handle actually shuffling the data over to the replicas.  recovery_info
   * and recovery_progress are expressed in terms of the logical offset
   * space except for data_included which is in terms of the chunked object
   * space (to match the passed buffer).
   *
   * xattrs are requested on the first read and used to initialize the
   * object_context if missing on completion of the first read.
   *
   * In order to batch up reads and writes, we batch Push, PushReply,
   * Transaction, and reads in a RecoveryMessages object which is passed
   * among the recovery methods.
   */
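  /*
   * Rough sketch of the state transitions described above (illustrative only;
   * the authoritative logic lives in continue_recovery_op()):
   *
   *   IDLE    --start reads--------------------> READING
   *   READING --read complete, decode, push----> WRITING
   *   WRITING --all push replies---------------> COMPLETE (object recovered)
   *   WRITING --all push replies---------------> IDLE     (more extents left)
   */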
  struct RecoveryOp {
    set<pg_shard_t> missing_on;
    set<shard_id_t> missing_on_shards;

    ObjectRecoveryInfo recovery_info;
    ObjectRecoveryProgress recovery_progress;

    enum state_t { IDLE, READING, WRITING, COMPLETE } state;
    static const char* tostr(state_t state) {
      switch (state) {
      case ECBackend::RecoveryOp::IDLE:
        return "IDLE";
      case ECBackend::RecoveryOp::READING:
        return "READING";
      case ECBackend::RecoveryOp::WRITING:
        return "WRITING";
      case ECBackend::RecoveryOp::COMPLETE:
        return "COMPLETE";
      default:
        ceph_abort();
        return "";
      }
    }
    // must be filled if state == WRITING
    map<int, bufferlist> returned_data;
    map<string, bufferlist> xattrs;
    ECUtil::HashInfoRef hinfo;
    ObjectContextRef obc;
    set<pg_shard_t> waiting_on_pushes;

    // valid in state READING
    pair<uint64_t, uint64_t> extent_requested;

    void dump(Formatter *f) const;

    RecoveryOp() : state(IDLE) {}
  };
  friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
  map<hobject_t, RecoveryOp> recovery_ops;
  void continue_recovery_op(
    RecoveryMessages *m);
  void dispatch_recovery_messages(RecoveryMessages &m, int priority);
  friend struct OnRecoveryReadComplete;
  void handle_recovery_read_complete(
    const hobject_t &hoid,
    boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read,
    boost::optional<map<string, bufferlist> > attrs,
    RecoveryMessages *m);
  void handle_recovery_push(
    RecoveryMessages *m);
  void handle_recovery_push_reply(
    const PushReplyOp &op,
    RecoveryMessages *m);
  void get_all_avail_shards(
    const hobject_t &hoid,
    const set<pg_shard_t> &error_shards,
    map<shard_id_t, pg_shard_t> &shards,
  /**
   * Low level async read mechanism
   *
   * To avoid duplicating the logic for requesting and waiting for
   * multiple object shards, there is a common async read mechanism
   * taking a map of hobject_t->read_request_t which defines callbacks
   * taking read_result_ts as arguments.
   *
   * tid_to_read_map gives open read ops.  check_recovery_sources uses
   * shard_to_read_map and ReadOp::source_to_obj to restart reads
   * involving down osds.
   *
   * The user is responsible for specifying replicas on which to read
   * and for reassembling the buffer on the other side since client
   * reads require the original object buffer while recovery only needs
   * the missing pieces.
   *
   * Rather than handling reads on the primary directly, we simply send
   * ourselves a message.  This avoids a dedicated primary path for that
   * part.
   */
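  /*
   * Illustrative sketch of the flow described above (simplified; names other
   * than the members declared below are informal):
   *
   *   1. The caller builds a read_request_t per object (extents to read,
   *      shards to read from, want_attrs, and a callback) plus the per-object
   *      want_to_read shard sets, and starts a read op, which is assigned a
   *      tid and recorded in tid_to_read_map and shard_to_read_map.
   *   2. Each shard reply is matched back to the ReadOp by tid and
   *      accumulated into the per-object read_result_t.
   *   3. Once every needed shard has answered (or errored), the object's
   *      callback runs with a (RecoveryMessages*, read_result_t&) pair and
   *      complete_read_op() retires the ReadOp.
   */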
  struct read_result_t {
    int r;
    map<pg_shard_t, int> errors;
    boost::optional<map<string, bufferlist> > attrs;
    list<boost::tuple<
      uint64_t, uint64_t, map<pg_shard_t, bufferlist> > > returned;
    read_result_t() : r(0) {}
  };
  struct read_request_t {
    const list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
    const set<pg_shard_t> need;
    const bool want_attrs;
    GenContext<pair<RecoveryMessages *, read_result_t& > &> *cb;
    read_request_t(
      const list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read,
      const set<pg_shard_t> &need,
      bool want_attrs,
      GenContext<pair<RecoveryMessages *, read_result_t& > &> *cb)
      : to_read(to_read), need(need), want_attrs(want_attrs),
        cb(cb) {}
  };
  friend ostream &operator<<(ostream &lhs, const read_request_t &rhs);
  struct ReadOp {
    int priority;
    ceph_tid_t tid;
    OpRequestRef op; // may be null if not on behalf of a client
    // True if redundant reads are issued, false otherwise;
    // this is useful to trade off some resources (redundant ops) for
    // low-latency reads, especially on a relatively idle cluster.
    bool do_redundant_reads;
    // True if reading for recovery, which may read from only a subset
    // of the available shards.
    bool for_recovery;

    ZTracer::Trace trace;
    map<hobject_t, set<int>> want_to_read;
    map<hobject_t, read_request_t> to_read;
    map<hobject_t, read_result_t> complete;

    map<hobject_t, set<pg_shard_t>> obj_to_source;
    map<pg_shard_t, set<hobject_t> > source_to_obj;

    void dump(Formatter *f) const;

    set<pg_shard_t> in_progress;
    ReadOp(
      int priority,
      ceph_tid_t tid,
      bool do_redundant_reads,
      bool for_recovery,
      OpRequestRef op,
      map<hobject_t, set<int>> &&_want_to_read,
      map<hobject_t, read_request_t> &&_to_read)
      : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads),
        for_recovery(for_recovery), want_to_read(std::move(_want_to_read)),
        to_read(std::move(_to_read)) {
      for (auto &&hpair: to_read) {
        auto &returned = complete[hpair.first].returned;
        for (auto &&extent: hpair.second.to_read) {
          returned.push_back(
            boost::make_tuple(
              extent.get<0>(),
              extent.get<1>(),
              map<pg_shard_t, bufferlist>()));
        }
      }
    }
    ReadOp(const ReadOp &) = default;
    ReadOp(ReadOp &&) = default;
  };
  friend struct FinishReadOp;
    const OSDMapRef& osdmap,
  void complete_read_op(ReadOp &rop, RecoveryMessages *m);
  friend ostream &operator<<(ostream &lhs, const ReadOp &rhs);
  map<ceph_tid_t, ReadOp> tid_to_read_map;
  map<pg_shard_t, set<ceph_tid_t> > shard_to_read_map;
    map<hobject_t, set<int>> &want_to_read,
    map<hobject_t, read_request_t> &to_read,
    bool do_redundant_reads, bool for_recovery);
  void do_read_op(ReadOp &rop);
  int send_all_remaining_reads(
    const hobject_t &hoid,
  /**
   * ECTransaction is responsible for generating a transaction for
   * each shard to which we need to send the write.  As required
   * by the PGBackend interface, the ECBackend write mechanism
   * passes trim information with the write and last_complete back
   * with the reply.
   *
   * As with client reads, there is a possibility of out-of-order
   * completions.  Thus, callbacks and completion are called in order
   * on the writing list.
   */
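  /*
   * Rough sketch of the write path described above (illustrative only):
   *
   *   submit_transaction()                // caller hands over the PGTransaction
   *     -> start_rmw(op, std::move(t))    // Op recorded in tid_to_op_map
   *     -> ECTransaction builds a per-shard transaction
   *     -> handle_sub_write() on each shard (the primary sends to itself)
   *     -> replicas ack via handle_sub_write_reply(); the local shard
   *        completes via sub_write_committed()/sub_write_applied()
   *     -> once pending_commit/pending_apply drain, on_all_commit /
   *        on_all_applied fire in writing-list order
   */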
  struct Op : boost::intrusive::list_base_hook<> {
    /// From submit_transaction caller, describes the operation
    object_stat_sum_t delta_stats;
    boost::optional<pg_hit_set_history_t> updated_hit_set_history;
    vector<pg_log_entry_t> log_entries;
    ZTracer::Trace trace;
    eversion_t roll_forward_to; /// Soon to be generated internally
    /// Ancillary also provided from submit_transaction caller
    map<hobject_t, ObjectContextRef> obc_map;

    /// see call_write_ordered
    std::list<std::function<void(void)> > on_write;
    /// Generated internally
    set<hobject_t> temp_added;
    set<hobject_t> temp_cleared;
    ECTransaction::WritePlan plan;
    bool requires_rmw() const { return !plan.to_read.empty(); }
    bool invalidates_cache() const { return plan.invalidates_cache; }

    // must be true if requires_rmw(), must be false if invalidates_cache()
    bool using_cache = false;
    /// In progress read state;
    map<hobject_t,extent_set> pending_read; // subset already being read
    map<hobject_t,extent_set> remote_read;  // subset we must read
    map<hobject_t,extent_map> remote_read_result;
    bool read_in_progress() const {
      return !remote_read.empty() && remote_read_result.empty();
    }
    /// In progress write state
    set<pg_shard_t> pending_commit;
    set<pg_shard_t> pending_apply;
    bool write_in_progress() const {
      return !pending_commit.empty() || !pending_apply.empty();
    }
    /// optional, may be null, for tracking purposes
    OpRequestRef client_op;

    ExtentCache::write_pin pin;
    Context *on_local_applied_sync = nullptr;
    Context *on_all_applied = nullptr;
    Context *on_all_commit = nullptr;
    ~Op() {
      delete on_local_applied_sync;
      delete on_all_applied;
      delete on_all_commit;
    }
  };
  using op_list = boost::intrusive::list<Op>;
  friend ostream &operator<<(ostream &lhs, const Op &rhs);
  map<ceph_tid_t, Op> tid_to_op_map; /// Owns Op structure
  /**
   * We model the possible rmw states as a set of waitlists.
   * All writes at this time complete in order, so a write blocked
   * at waiting_state blocks all writes behind it as well (the same
   * holds for the other states).
   *
   * Future work: We can break this up into a per-object pipeline
   * (almost).  First, provide an ordering token to submit_transaction
   * and require that all operations within a single transaction take
   * place on a subset of hobject_t space partitioned by that token
   * (the hashid seems about right to me -- it even works for temp objects
   * if you recall that a temp object created for object head foo will
   * only ever be referenced by other transactions on foo and isn't
   * reused).  Next, factor this part into a class and maintain one per
   * ordering token.  Next, fix up PrimaryLogPG's repop queue to be
   * partitioned by ordering token.  Finally, refactor the op pipeline
   * so that the log entries passed into submit_transaction aren't
   * versioned.  We can't assign versions to them until we actually
   * submit the operation.  That's probably going to be the hard part.
   */
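  /*
   * Illustrative sketch of the in-order write pipeline described above; the
   * stages correspond to the op_lists and helpers declared below:
   *
   *   submit_transaction / start_rmw -> waiting_state
   *   try_state_to_reads             -> waiting_reads   (if a partial-stripe
   *                                                      read is needed first)
   *   try_reads_to_commit            -> waiting_commit  (sub-writes sent)
   *   try_finish_rmw                 -> done            (completion Contexts
   *                                                      called in order)
   */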
  class pipeline_state_t {
    enum {
      CACHE_VALID,
      CACHE_INVALID
    } pipeline_state = CACHE_VALID;
  public:
    bool caching_enabled() const {
      return pipeline_state == CACHE_VALID;
    }
    bool cache_invalid() const {
      return !caching_enabled();
    }
    void invalidate() {
      pipeline_state = CACHE_INVALID;
    }
    void clear() {
      pipeline_state = CACHE_VALID;
    }
    friend ostream &operator<<(ostream &lhs, const pipeline_state_t &rhs);
  } pipeline_state;
  op_list waiting_state;  /// writes waiting on pipe_state
  op_list waiting_reads;  /// writes waiting on partial stripe reads
  op_list waiting_commit; /// writes waiting on initial commit
  eversion_t completed_to;
  eversion_t committed_to;
  void start_rmw(Op *op, PGTransactionUPtr &&t);
  bool try_state_to_reads();
  bool try_reads_to_commit();
  bool try_finish_rmw();
  ErasureCodeInterfaceRef ec_impl;
  /**
   * Determines whether _have is sufficient to recover an object.
   */
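  /*
   * Worked example (hypothetical 2+1 jerasure-style profile): with k=2 data
   * chunks and m=1 coding chunk, minimum_to_decode() succeeds whenever any 2
   * of the 3 shards are present, so _have = {0, 2} is recoverable while
   * _have = {1} is not.
   */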
  class ECRecPred : public IsPGRecoverablePredicate {
    set<int> want;
    ErasureCodeInterfaceRef ec_impl;
  public:
    explicit ECRecPred(ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) {
      for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) {
        want.insert(i);
      }
    }
    bool operator()(const set<pg_shard_t> &_have) const override {
      set<int> have;
      for (set<pg_shard_t>::const_iterator i = _have.begin();
           i != _have.end();
           ++i) {
        have.insert(i->shard);
      }
      set<int> min;
      return ec_impl->minimum_to_decode(want, have, &min) == 0;
    }
  };
  IsPGRecoverablePredicate *get_is_recoverable_predicate() override {
    return new ECRecPred(ec_impl);
  }
  /**
   * Determines whether _have is sufficient to read an object.
   */
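  /*
   * Worked example (same hypothetical 2+1 profile): readability additionally
   * requires the local shard, so on shard 1 the set _have = {0, 2} would be
   * recoverable but not readable, while _have = {0, 1} is both.
   */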
  class ECReadPred : public IsPGReadablePredicate {
    pg_shard_t whoami;
    ECRecPred rec_pred;
  public:
    ECReadPred(
      pg_shard_t whoami,
      ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {}
    bool operator()(const set<pg_shard_t> &_have) const override {
      return _have.count(whoami) && rec_pred(_have);
    }
  };
  IsPGReadablePredicate *get_is_readable_predicate() override {
    return new ECReadPred(get_parent()->whoami_shard(), ec_impl);
  }
  const ECUtil::stripe_info_t sinfo;
  /// If modified, ensure that the ref is held until the update is applied
  SharedPtrRegistry<hobject_t, ECUtil::HashInfo> unstable_hashinfo_registry;
  ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool checks = true,
                                    const map<string,bufferptr> *attr = NULL);
    PGBackend::Listener *pg,
    ObjectStore::CollectionHandle &ch,
    ErasureCodeInterfaceRef ec_impl,
    uint64_t stripe_width);
  /// Returns to_read replicas sufficient to reconstruct want
  int get_min_avail_to_read_shards(
    const hobject_t &hoid,     ///< [in] object
    const set<int> &want,      ///< [in] desired shards
    bool for_recovery,         ///< [in] true if we may use non-acting replicas
    bool do_redundant_reads,   ///< [in] true if we want to issue redundant reads to reduce latency
    set<pg_shard_t> *to_read   ///< [out] shards to read
    );                         ///< @return error code, 0 on success
  int get_remaining_shards(
    const hobject_t &hoid,
    const set<int> &avail,
    const set<int> &want,
    const read_result_t &result,
    set<pg_shard_t> *to_read,
  int objects_get_attrs(
    const hobject_t &hoid,
    map<string, bufferlist> *out) override;
  void rollback_append(
    const hobject_t &hoid,
    ObjectStore::Transaction *t) override;
  bool scrub_supported() override { return true; }
  bool auto_repair_supported() const override { return true; }
    const hobject_t &poid,
    ScrubMapBuilder &pos,
    ScrubMap::object &o) override;
  uint64_t be_get_ondisk_size(uint64_t logical_size) override {
    return sinfo.logical_to_next_chunk_offset(logical_size);
  }
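  // Rough numeric example (hypothetical profile, assuming
  // logical_to_next_chunk_offset rounds a logical size up to whole stripes
  // and returns the per-shard size): with k=2 data chunks of 4 KiB each
  // (stripe width 8 KiB), a logical size of 10 KiB spans two stripes, so
  // each shard stores 2 chunks = 8 KiB on disk.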
  void _failed_push(const hobject_t &hoid,
                    pair<RecoveryMessages *, ECBackend::read_result_t &> &in);
};
ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs);