// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2013 Inktank Storage, Inc.
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */

#include <boost/intrusive/set.hpp>
#include <boost/intrusive/list.hpp>

#include "PGBackend.h"
#include "erasure-code/ErasureCodeInterface.h"
#include "ECTransaction.h"
#include "ExtentCache.h"
struct ECSubWriteReply;
struct ECSubReadReply;

struct RecoveryMessages;
class ECBackend : public PGBackend {
public:
  RecoveryHandle *open_recovery_op() override;
  int recover_object(
    const hobject_t &hoid,
    ObjectContextRef head,
    RecoveryHandle *h) override;

  bool can_handle_while_inactive(OpRequestRef op) override;
  friend struct SubWriteApplied;
  friend struct SubWriteCommitted;
  void sub_write_committed(
    eversion_t last_complete,
    const ZTracer::Trace &trace);
  void handle_sub_write(
    const ZTracer::Trace &trace);
  void handle_sub_read(
    ECSubReadReply *reply,
    const ZTracer::Trace &trace);
  void handle_sub_write_reply(
    const ECSubWriteReply &op,
    const ZTracer::Trace &trace);
  void handle_sub_read_reply(
    const ZTracer::Trace &trace);
  void check_recovery_sources(const OSDMapRef &osdmap) override;

  void on_change() override;
  void clear_recovery_state() override;

  void dump_recovery_info(Formatter *f) const override;

  void call_write_ordered(std::function<void(void)> &&cb) override;
  void submit_transaction(
    const hobject_t &hoid,
    const object_stat_sum_t &delta_stats,
    const eversion_t &at_version,
    PGTransactionUPtr &&t,
    const eversion_t &trim_to,
    const eversion_t &roll_forward_to,
    const vector<pg_log_entry_t> &log_entries,
    boost::optional<pg_hit_set_history_t> &hset_history,
    Context *on_all_commit) override;
  int objects_read_sync(
    const hobject_t &hoid,
    bufferlist *bl) override;
  /**
   * Async read mechanism
   *
   * Async reads use the same async read mechanism as does recovery.
   * CallClientContexts is responsible for reconstructing the response
   * buffer as well as for calling the callbacks.
   *
   * One tricky bit is that two reads may possibly not read from the same
   * set of replicas.  This could result in two reads completing in the
   * wrong (from the interface user's point of view) order.  Thus, we
   * maintain a queue of in-progress reads (@see in_progress_client_reads)
   * to ensure that we always call the completion callback in order.
   *
   * Another subtlety is that while we may read a degraded object, we will
   * still only perform a client read from shards in the acting set.  This
   * ensures that we won't ever have to restart a client-initiated read in
   * check_recovery_sources.
   */
  void objects_read_and_reconstruct(
    const map<hobject_t,
              std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > &reads,
    GenContextURef<map<hobject_t, pair<int, extent_map> > &&> &&func);
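  /*
   * Illustrative sketch only (hypothetical caller, mirroring the
   * objects_read_async_no_cache() helper below): build a per-object list of
   * (offset, length, flags) extents and hand over a completion functor; the
   * result is an error code plus extent_map per object, delivered in
   * submission order via in_progress_client_reads.
   *
   *   map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > reads;
   *   reads[hoid].emplace_back(off, len, 0);       // hoid/off/len are assumed
   *   auto on_complete = [](map<hobject_t, pair<int, extent_map> > &&results) {
   *     // inspect results[hoid].first (error code) and .second (buffers)
   *   };
   *   objects_read_and_reconstruct(
   *     reads,
   *     make_gen_lambda_context<
   *       map<hobject_t, pair<int, extent_map> > &&, decltype(on_complete)>(
   *         std::move(on_complete)));
   */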
  friend struct CallClientContexts;
  // Tracks one in-flight client read; completed reads are drained in
  // submission order so callbacks fire in order.
  struct ClientAsyncReadStatus {
    unsigned objects_to_read;
    GenContextURef<map<hobject_t, pair<int, extent_map> > &&> func;
    map<hobject_t, pair<int, extent_map> > results;
    explicit ClientAsyncReadStatus(
      unsigned objects_to_read,
      GenContextURef<map<hobject_t, pair<int, extent_map> > &&> &&func)
      : objects_to_read(objects_to_read), func(std::move(func)) {}
    void complete_object(
      const hobject_t &hoid,
      int err,
      extent_map &&buffers) {
      ceph_assert(objects_to_read);
      --objects_to_read;
      ceph_assert(!results.count(hoid));
      results.emplace(hoid, make_pair(err, std::move(buffers)));
    }
    bool is_complete() const {
      return objects_to_read == 0;
    }
    void run() {
      func.release()->complete(std::move(results));
    }
  };
  list<ClientAsyncReadStatus> in_progress_client_reads;
  void objects_read_async(
    const hobject_t &hoid,
    const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
                    pair<bufferlist*, Context*> > > &to_read,
    Context *on_complete,
    bool fast_read = false) override;
  template <typename Func>
  void objects_read_async_no_cache(
    const map<hobject_t, extent_set> &to_read,
    Func &&on_complete) {
    map<hobject_t, std::list<boost::tuple<uint64_t, uint64_t, uint32_t> > > _to_read;
    for (auto &&hpair : to_read) {
      auto &l = _to_read[hpair.first];
      for (auto extent : hpair.second) {
        l.emplace_back(extent.first, extent.second, 0);
      }
    }
    objects_read_and_reconstruct(
      _to_read,
      make_gen_lambda_context<
        map<hobject_t, pair<int, extent_map> > &&, Func>(
          std::forward<Func>(on_complete)));
  }
  void kick_reads() {
    while (in_progress_client_reads.size() &&
           in_progress_client_reads.front().is_complete()) {
      in_progress_client_reads.front().run();
      in_progress_client_reads.pop_front();
    }
  }
  friend struct ECRecoveryHandle;
  uint64_t get_recovery_chunk_size() const {
    return round_up_to(cct->_conf->osd_recovery_max_chunk,
                       sinfo.get_stripe_width());
  }
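  // Worked example (illustrative numbers only): with osd_recovery_max_chunk
  // = 8 MiB and a stripe width of 24 KiB (say, k=6 data chunks of 4 KiB),
  // 8192 KiB is not a multiple of 24 KiB, so the recovery chunk size is
  // rounded up to 342 * 24 KiB = 8208 KiB.  Recovering in whole stripes
  // keeps each shard's read aligned to chunk boundaries.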
  void get_want_to_read_shards(set<int> *want_to_read) const {
    const vector<int> &chunk_mapping = ec_impl->get_chunk_mapping();
    for (int i = 0; i < (int)ec_impl->get_data_chunk_count(); ++i) {
      int chunk = (int)chunk_mapping.size() > i ? chunk_mapping[i] : i;
      want_to_read->insert(chunk);
    }
  }
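  // For example (hypothetical plugin mapping): with 3 data chunks and a
  // chunk mapping of {0, 2, 4}, want_to_read becomes {0, 2, 4}; with an
  // empty mapping it falls back to the identity and yields {0, 1, 2}, i.e.
  // the shards holding data chunks 0..k-1.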
  /**
   * Recovery uses the same underlying read mechanism as client reads,
   * with the slight difference that recovery reads may come from
   * non-acting shards.  Thus, check_recovery_sources may wind up calling
   * cancel_pull for a read originating with RecoveryOp.
   *
   * The recovery process is expressed as a state machine:
   * - IDLE: Nothing is currently in progress; reads will be started and
   *         we will transition to READING.
   * - READING: We are awaiting a pending read op.  Once complete, we will
   *            decode the buffers and proceed to WRITING.
   * - WRITING: We are awaiting a completed push.  Once complete, we will
   *            either transition to COMPLETE or to IDLE to continue.
   * - COMPLETE: complete.
   *
   * We use the existing Push and PushReply messages and structures to
   * handle actually shuffling the data over to the replicas.  recovery_info
   * and recovery_progress are expressed in terms of the logical offset
   * space, except for data_included, which is in terms of the chunked
   * object space (to match the passed buffer).
   *
   * xattrs are requested on the first read and used to initialize the
   * object_context if missing on completion of the first read.
   *
   * In order to batch up reads and writes, we batch Push, PushReply,
   * Transaction, and reads in a RecoveryMessages object which is passed
   * among the recovery methods.
   */
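  /*
   * Rough shape of one pass, as a sketch (the actual driver is
   * continue_recovery_op(), declared below and implemented in ECBackend.cc):
   *
   *   IDLE    -> issue shard reads for the next extent            -> READING
   *   READING -> decode the returned shard buffers and queue a
   *              Push to every shard still missing the object     -> WRITING
   *   WRITING -> collect a PushReply from each of those shards    -> IDLE
   *              (more extents to do) or COMPLETE (object done)
   *
   * The reads, Pushes, and PushReplies generated during a pass are
   * accumulated in a RecoveryMessages batch and sent out together by
   * dispatch_recovery_messages().
   */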
  struct RecoveryOp {
    set<pg_shard_t> missing_on;
    set<shard_id_t> missing_on_shards;

    ObjectRecoveryInfo recovery_info;
    ObjectRecoveryProgress recovery_progress;

    enum state_t { IDLE, READING, WRITING, COMPLETE } state;
    static const char* tostr(state_t state) {
      switch (state) {
      case ECBackend::RecoveryOp::IDLE:
        return "IDLE";
      case ECBackend::RecoveryOp::READING:
        return "READING";
      case ECBackend::RecoveryOp::WRITING:
        return "WRITING";
      case ECBackend::RecoveryOp::COMPLETE:
        return "COMPLETE";
      }
    }
    // must be filled if state == WRITING
    map<int, bufferlist> returned_data;
    map<string, bufferlist> xattrs;
    ECUtil::HashInfoRef hinfo;
    ObjectContextRef obc;
    set<pg_shard_t> waiting_on_pushes;

    // valid in state READING
    pair<uint64_t, uint64_t> extent_requested;

    void dump(Formatter *f) const;

    RecoveryOp() : state(IDLE) {}
  };
  friend ostream &operator<<(ostream &lhs, const RecoveryOp &rhs);
  map<hobject_t, RecoveryOp> recovery_ops;
  void continue_recovery_op(
    RecoveryOp &op,
    RecoveryMessages *m);
  void dispatch_recovery_messages(RecoveryMessages &m, int priority);
  friend struct OnRecoveryReadComplete;
  void handle_recovery_read_complete(
    const hobject_t &hoid,
    boost::tuple<uint64_t, uint64_t, map<pg_shard_t, bufferlist> > &to_read,
    boost::optional<map<string, bufferlist> > attrs,
    RecoveryMessages *m);
  void handle_recovery_push(
    const PushOp &op,
    RecoveryMessages *m);
  void handle_recovery_push_reply(
    const PushReplyOp &op,
    RecoveryMessages *m);
  void get_all_avail_shards(
    const hobject_t &hoid,
    const set<pg_shard_t> &error_shards,
    map<shard_id_t, pg_shard_t> &shards,
    bool for_recovery);
  /**
   * Low level async read mechanism
   *
   * To avoid duplicating the logic for requesting and waiting for
   * multiple object shards, there is a common async read mechanism
   * taking a map of hobject_t->read_request_t, which defines callbacks
   * taking read_result_ts as arguments.
   *
   * tid_to_read_map gives open read ops.  check_recovery_sources uses
   * shard_to_read_map and ReadOp::source_to_obj to restart reads
   * involving down osds.
   *
   * The user is responsible for specifying replicas on which to read
   * and for reassembling the buffer on the other side, since client
   * reads require the original object buffer while recovery only needs
   * the missing pieces.
   *
   * Rather than handling reads on the primary directly, we simply send
   * ourselves a message.  This avoids a dedicated primary path for that
   * part.
   */
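  /*
   * Minimal sketch of a caller (hypothetical; names like `hoid`, `to_read`,
   * `need` and `cb` are assumed): each object gets a read_request_t naming
   * the extents to read, the shards (and subchunks) to read them from, and
   * a callback that will be handed the read_result_t.
   *
   *   map<hobject_t, read_request_t> requests;
   *   requests.emplace(
   *     hoid,
   *     read_request_t(
   *       to_read,   // list of (offset, length, flags) tuples
   *       need,      // pg_shard_t -> subchunks, e.g. from
   *                  // get_min_avail_to_read_shards()
   *       false,     // want_attrs
   *       cb));      // GenContext<pair<RecoveryMessages*, read_result_t&> &>*
   *
   * The resulting ReadOp lives in tid_to_read_map until every shard has
   * replied or errored out, at which point cb->complete() is invoked with
   * the accumulated read_result_t.
   */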
  struct read_result_t {
    int r;
    map<pg_shard_t, int> errors;
    boost::optional<map<string, bufferlist> > attrs;
    list<
      boost::tuple<
        uint64_t, uint64_t, map<pg_shard_t, bufferlist> > > returned;
    read_result_t() : r(0) {}
  };
  struct read_request_t {
    const list<boost::tuple<uint64_t, uint64_t, uint32_t> > to_read;
    const map<pg_shard_t, vector<pair<int, int>>> need;
    const bool want_attrs;
    GenContext<pair<RecoveryMessages *, read_result_t &> &> *cb;
    read_request_t(
      const list<boost::tuple<uint64_t, uint64_t, uint32_t> > &to_read,
      const map<pg_shard_t, vector<pair<int, int>>> &need,
      bool want_attrs,
      GenContext<pair<RecoveryMessages *, read_result_t &> &> *cb)
      : to_read(to_read), need(need), want_attrs(want_attrs),
        cb(cb) {}
  };
  friend ostream &operator<<(ostream &lhs, const read_request_t &rhs);
  struct ReadOp {
    int priority;
    ceph_tid_t tid;
    OpRequestRef op; // may be null if not on behalf of a client
    // True if redundant reads are issued, false otherwise;
    // this is useful to trade some resources (redundant ops) for
    // lower read latency, especially on a relatively idle cluster.
    bool do_redundant_reads;
    // True if reading for recovery, which may read only a subset
    // of the available shards.
    bool for_recovery;

    ZTracer::Trace trace;

    map<hobject_t, set<int>> want_to_read;
    map<hobject_t, read_request_t> to_read;
    map<hobject_t, read_result_t> complete;

    map<hobject_t, set<pg_shard_t>> obj_to_source;
    map<pg_shard_t, set<hobject_t> > source_to_obj;

    void dump(Formatter *f) const;

    set<pg_shard_t> in_progress;
    ReadOp(
      int priority,
      ceph_tid_t tid,
      bool do_redundant_reads,
      bool for_recovery,
      OpRequestRef op,
      map<hobject_t, set<int>> &&_want_to_read,
      map<hobject_t, read_request_t> &&_to_read)
      : priority(priority), tid(tid), op(op), do_redundant_reads(do_redundant_reads),
        for_recovery(for_recovery), want_to_read(std::move(_want_to_read)),
        to_read(std::move(_to_read)) {
      // Pre-populate the result list with one (offset, length, empty shard
      // map) entry per requested extent.
      for (auto &&hpair : to_read) {
        auto &returned = complete[hpair.first].returned;
        for (auto &&extent : hpair.second.to_read) {
          returned.push_back(
            boost::make_tuple(
              extent.get<0>(),
              extent.get<1>(),
              map<pg_shard_t, bufferlist>()));
        }
      }
    }
    ReadOp(const ReadOp &) = default;
    ReadOp(ReadOp &&) = default;
  };
  friend struct FinishReadOp;
  void filter_read_op(
    const OSDMapRef &osdmap,
    ReadOp &op);
  void complete_read_op(ReadOp &rop, RecoveryMessages *m);
  friend ostream &operator<<(ostream &lhs, const ReadOp &rhs);
  map<ceph_tid_t, ReadOp> tid_to_read_map;
  map<pg_shard_t, set<ceph_tid_t> > shard_to_read_map;
  void start_read_op(
    map<hobject_t, set<int>> &want_to_read,
    map<hobject_t, read_request_t> &to_read,
    bool do_redundant_reads, bool for_recovery);

  void do_read_op(ReadOp &rop);
  int send_all_remaining_reads(
    const hobject_t &hoid,
    ReadOp &rop);
  /**
   * ECTransaction is responsible for generating a transaction for
   * each shard to which we need to send the write.  As required
   * by the PGBackend interface, the ECBackend write mechanism
   * passes trim information with the write and last_complete back
   * with the reply.
   *
   * As with client reads, there is a possibility of out-of-order
   * completions.  Thus, callbacks and completion are called in order
   * on the writing list.
   */
  struct Op : boost::intrusive::list_base_hook<> {
    /// From submit_transaction caller, describes operation
    object_stat_sum_t delta_stats;
    boost::optional<pg_hit_set_history_t> updated_hit_set_history;
    vector<pg_log_entry_t> log_entries;
    ZTracer::Trace trace;

    eversion_t roll_forward_to; /// Soon to be generated internally

    /// Ancillary also provided from submit_transaction caller
    map<hobject_t, ObjectContextRef> obc_map;

    /// see call_write_ordered
    std::list<std::function<void(void)> > on_write;

    /// Generated internally
    set<hobject_t> temp_added;
    set<hobject_t> temp_cleared;

    ECTransaction::WritePlan plan;
    bool requires_rmw() const { return !plan.to_read.empty(); }
    bool invalidates_cache() const { return plan.invalidates_cache; }

    // must be true if requires_rmw(), must be false if invalidates_cache()
    bool using_cache = true;

    /// In progress read state
    map<hobject_t, extent_set> pending_read; // subset already being read
    map<hobject_t, extent_set> remote_read;  // subset we must read
    map<hobject_t, extent_map> remote_read_result;
    bool read_in_progress() const {
      return !remote_read.empty() && remote_read_result.empty();
    }

    /// In progress write state
    set<pg_shard_t> pending_commit;
    // We need pending_apply for pre-mimic peers so that we don't issue a
    // read on a remote shard before it has applied a previous write.  We can
    // remove this after nautilus.
    set<pg_shard_t> pending_apply;
    bool write_in_progress() const {
      return !pending_commit.empty() || !pending_apply.empty();
    }

    /// optional, may be null, for tracking purposes
    OpRequestRef client_op;

    ExtentCache::write_pin pin;

    Context *on_all_commit = nullptr;
    ~Op() {
      delete on_all_commit;
    }
  };
  using op_list = boost::intrusive::list<Op>;
  friend ostream &operator<<(ostream &lhs, const Op &rhs);

  map<ceph_tid_t, Op> tid_to_op_map; /// Owns Op structure
  /**
   * We model the possible rmw states as a set of waitlists.
   * All writes at this time complete in order, so a write blocked
   * at waiting_state blocks all writes behind it as well (same for
   * the other states).
   *
   * Future work: We can break this up into a per-object pipeline
   * (almost).  First, provide an ordering token to submit_transaction
   * and require that all operations within a single transaction take
   * place on a subset of hobject_t space partitioned by that token
   * (the hashid seems about right to me -- it even works for temp objects
   * if you recall that a temp object created for object head foo will
   * only ever be referenced by other transactions on foo and isn't
   * reused).  Next, factor this part into a class and maintain one per
   * ordering token.  Next, fix up PrimaryLogPG's repop queue to be
   * partitioned by ordering token.  Finally, refactor the op pipeline
   * so that the log entries passed into submit_transaction aren't
   * versioned.  We can't assign versions to them until we actually
   * submit the operation.  That's probably going to be the hard part.
   */
  class pipeline_state_t {
    enum {
      CACHE_VALID = 0,
      CACHE_INVALID = 1
    } pipeline_state = CACHE_VALID;
  public:
    bool caching_enabled() const {
      return pipeline_state == CACHE_VALID;
    }
    bool cache_invalid() const {
      return !caching_enabled();
    }
    void invalidate() {
      pipeline_state = CACHE_INVALID;
    }
    void clear() {
      pipeline_state = CACHE_VALID;
    }
    friend ostream &operator<<(ostream &lhs, const pipeline_state_t &rhs);
  } pipeline_state;
  op_list waiting_state;  /// writes waiting on pipeline_state
  op_list waiting_reads;  /// writes waiting on partial stripe reads
  op_list waiting_commit; /// writes waiting on initial commit
  eversion_t completed_to;
  eversion_t committed_to;
  void start_rmw(Op *op, PGTransactionUPtr &&t);
  bool try_state_to_reads();
  bool try_reads_to_commit();
  bool try_finish_rmw();
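  /*
   * Illustrative flow only (the real driver lives in ECBackend.cc; the name
   * below is hypothetical): an Op enters waiting_state from start_rmw() and
   * is advanced one list at a time for as long as any stage makes progress.
   *
   *   void check_pipeline_sketch() {
   *     while (try_state_to_reads() ||   // waiting_state  -> waiting_reads
   *            try_reads_to_commit() ||  // waiting_reads  -> waiting_commit
   *            try_finish_rmw())         // waiting_commit -> done
   *       ;
   *   }
   *
   * Because each list is drained strictly in order, a write stalled in an
   * earlier stage blocks everything queued behind it, as described above.
   */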
  ErasureCodeInterfaceRef ec_impl;
  /**
   * Determines whether _have is sufficient to recover an object
   */
  class ECRecPred : public IsPGRecoverablePredicate {
    set<int> want;
    ErasureCodeInterfaceRef ec_impl;
  public:
    explicit ECRecPred(ErasureCodeInterfaceRef ec_impl) : ec_impl(ec_impl) {
      for (unsigned i = 0; i < ec_impl->get_chunk_count(); ++i) {
        want.insert(i);
      }
    }
    bool operator()(const set<pg_shard_t> &_have) const override {
      set<int> have;
      for (set<pg_shard_t>::const_iterator i = _have.begin();
           i != _have.end();
           ++i) {
        have.insert(i->shard);
      }
      map<int, vector<pair<int, int>>> min;
      return ec_impl->minimum_to_decode(want, have, &min) == 0;
    }
  };
  IsPGRecoverablePredicate *get_is_recoverable_predicate() const override {
    return new ECRecPred(ec_impl);
  }
  int get_ec_data_chunk_count() const override {
    return ec_impl->get_data_chunk_count();
  }
  int get_ec_stripe_chunk_size() const override {
    return sinfo.get_chunk_size();
  }
  /**
   * Determines whether _have is sufficient to read an object
   */
  class ECReadPred : public IsPGReadablePredicate {
    pg_shard_t whoami;
    ECRecPred rec_pred;
  public:
    ECReadPred(
      pg_shard_t whoami,
      ErasureCodeInterfaceRef ec_impl) : whoami(whoami), rec_pred(ec_impl) {}
    bool operator()(const set<pg_shard_t> &_have) const override {
      return _have.count(whoami) && rec_pred(_have);
    }
  };
  IsPGReadablePredicate *get_is_readable_predicate() const override {
    return new ECReadPred(get_parent()->whoami_shard(), ec_impl);
  }
  const ECUtil::stripe_info_t sinfo;
  /// If modified, ensure that the ref is held until the update is applied
  SharedPtrRegistry<hobject_t, ECUtil::HashInfo> unstable_hashinfo_registry;
  ECUtil::HashInfoRef get_hash_info(const hobject_t &hoid, bool checks = true,
                                    const map<string, bufferptr> *attr = NULL);
  ECBackend(
    PGBackend::Listener *pg,
    ObjectStore::CollectionHandle &ch,
    ErasureCodeInterfaceRef ec_impl,
    uint64_t stripe_width);
  /// Returns to_read replicas sufficient to reconstruct want
  int get_min_avail_to_read_shards(
    const hobject_t &hoid,     ///< [in] object
    const set<int> &want,      ///< [in] desired shards
    bool for_recovery,         ///< [in] true if we may use non-acting replicas
    bool do_redundant_reads,   ///< [in] true if we want to issue redundant reads to reduce latency
    map<pg_shard_t, vector<pair<int, int>>> *to_read ///< [out] shards, corresponding subchunks to read
    ); ///< @return error code, 0 on success
  int get_remaining_shards(
    const hobject_t &hoid,
    const set<int> &avail,
    const set<int> &want,
    const read_result_t &result,
    map<pg_shard_t, vector<pair<int, int>>> *to_read,
    bool for_recovery);
  int objects_get_attrs(
    const hobject_t &hoid,
    map<string, bufferlist> *out) override;

  void rollback_append(
    const hobject_t &hoid,
    ObjectStore::Transaction *t) override;

  bool auto_repair_supported() const override { return true; }
  int be_deep_scrub(
    const hobject_t &poid,
    ScrubMapBuilder &pos,
    ScrubMap::object &o) override;
  uint64_t be_get_ondisk_size(uint64_t logical_size) override {
    return sinfo.logical_to_next_chunk_offset(logical_size);
  }
  void _failed_push(const hobject_t &hoid,
                    pair<RecoveryMessages *, ECBackend::read_result_t &> &in);
};

ostream &operator<<(ostream &lhs, const ECBackend::pipeline_state_t &rhs);