ceph/src/erasure-code/ErasureCodeInterface.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph distributed storage system
   5  *
   6  * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
   7  *
   8  * Author: Loic Dachary <loic@dachary.org>
   9  *
  10  *  This library is free software; you can redistribute it and/or
  11  *  modify it under the terms of the GNU Lesser General Public
  12  *  License as published by the Free Software Foundation; either
  13  *  version 2.1 of the License, or (at your option) any later version.
  14  *
  15  */
  16
  17 #ifndef CEPH_ERASURE_CODE_INTERFACE_H
  18 #define CEPH_ERASURE_CODE_INTERFACE_H
  19
  20 /*! @file ErasureCodeInterface.h
  21     @brief Interface provided by erasure code plugins
  22
  23     The erasure coded pools rely on plugins implementing
  24     **ErasureCodeInterface** to encode and decode content. All codes
  25     are systematic (i.e. the data is not mangled and can be
  26     reconstructed by concatenating chunks ).
  27
  28     Methods returning an **int** return **0** on success and a
  29     negative value on error. If the value returned on error is not
  30     explained in **ErasureCodeInterface**, the sources or the
  31     documentation of the interface implementer (i.e. the plugin ) must
  32     be read to figure out what it means. It is recommended that each
  33     error code matches an *errno* value that relates to the cause of
  34     the error.
  35
  36     If an object is small enough, the caller can process it with
  37     one call to the **encode** or **decode** method.
  38
  39         +---------------- coded object O -------------------------+
  40         |+----------------+ +----------------+ +----------------+ |
  41         ||    chunk  0    | |    chunk  1    | |    chunk  2    | |
  42         ||    [0,N)       | |    [N,2N)      | |    [2N,3N)     | |
  43         |+----------------+ +----------------+ +----------------+ |
  44         +------^--------------------------------------------------+
  45                |
  46    chunk B / C | offset B % C ( where C is the chunk size )
  47                |
  48          +-----^---- raw object O ----+------+
  49          |     B     [0,X)            | pad  |
  50          +----------------------------+------+
  51
  52     The object size is paded so that each chunks are of the same size.
  53     In the example above, if the actual object size was X, then it
  54     will be padded to 2N >= X assuming there are two data chunks (0
  55     and 1) and one coding chunk (2).
  56
  57     For chunks of size C, byte B of the object is found in chunk number
  58     B / C at offset B % C.
  59
  60     If an object is too large to be encoded in memory, the caller
  61     should divide it in smaller units named **stripes**.
  62
  63         +---------------------- object O -------------------------+
  64         |+----------------+ +----------------+ +----------------+ |
  65  stripe ||    chunk  0    | |    chunk  1    | |    chunk  2    | |
  66    0    ||    [0,N)       | |    [N,2N)      | |    [2N,3N)     | |
  67         |+----------------+ +----------------+ +----------------+ |
  68         |+----------------+ +----------------+ +----------------+ |
  69  stripe ||    chunk  0    | |    chunk  1    | |    chunk  2    | |
  70    1    ||    [X,M)       | |   [X+M,X+2M)   | |   [X+2M,X+3M)  | |
  71         ||                | |                | |                | |
  72         |+----------------+ +----------------+ +----------------+ |
  73         |                         ...                             |
  74         +---------------------------------------------------------+
  75
  76     The interface does not concern itself with stripes nor does it
  77     impose constraints on the size of each stripe. Variable names in
  78     the interface always use **object** and never use **stripe**.
  79
  80     Assuming the interface implementer provides three data chunks ( K
  81     = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
  82     follows:
  83
  84     ~~~~~~~~~~~~~~~~{.c}
  85     set<int> want_to_encode(0, 1, 2, // data chunks
  86                             3, 4     // coding chunks
  87                            );
  88     bufferlist in = "ABCDEF";
  89     map<int, bufferlist> encoded
  90     encode(want_to_encode, in, &encoded);
  91     encoded[0] == "AB" // data chunk 0
  92     encoded[1] == "CD" // data chunk 1
  93     encoded[2] == "EF" // data chunk 2
  94     encoded[3]         // coding chunk 0
  95     encoded[4]         // coding chunk 1
  96     ~~~~~~~~~~~~~~~~
  97
  98     The **minimum_to_decode_with_cost** method can be used to minimize
  99     the cost of fetching the chunks necessary to retrieve a given
 100     content. For instance, if encoded[2] (contained **EF**) is missing
 101     and accessing encoded[3] (the first coding chunk) is more
 102     expensive than accessing encoded[4] (the second coding chunk),
 103     **minimum_to_decode_with_cost** is expected to chose the first
 104     coding chunk.
 105
 106     ~~~~~~~~~~~~~~~~{.c}
 107     set<int> want_to_read(2); // want the chunk containing "EF"
 108     map<int,int> available(
 109           0 => 1,  // data chunk 0 : available and costs 1
 110           1 => 1,  // data chunk 1 : available and costs 1
 111                    // data chunk 2 : missing
 112           3 => 9,  // coding chunk 1 : available and costs 9
 113           4 => 1,  // coding chunk 2 : available and costs 1
 114     );
 115     set<int> minimum;
 116     minimum_to_decode_with_cost(want_to_read,
 117                                 available,
 118                                 &minimum);
 119     minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
 120     ~~~~~~~~~~~~~~~~
 121
 122     It sets **minimum** with three chunks to reconstruct the desired
 123     data chunk and will pick the second coding chunk ( 4 ) because it
 124     is less expensive ( 1 < 9 ) to retrieve than the first coding
 125     chunk ( 3 ). The caller is responsible for retrieving the chunks
 126     and call **decode** to reconstruct the second data chunk.
 127
 128     ~~~~~~~~~~~~~~~~{.c}
 129     map<int,bufferlist> chunks;
 130     for i in minimum.keys():
 131       chunks[i] = fetch_chunk(i); // get chunk from storage
 132     map<int, bufferlist> decoded;
 133     decode(want_to_read, chunks, &decoded);
 134     decoded[2] == "EF"
 135     ~~~~~~~~~~~~~~~~
 136
 137     The semantic of the cost value is defined by the caller and must
 138     be known to the implementer. For instance, it may be more
 139     expensive to retrieve two chunks with cost 1 + 9 = 10 than two
 140     chunks with cost 6 + 6 = 12.
 141  */
 142
 143 #include <map>
 144 #include <set>
 145 #include <vector>
 146 #include <ostream>
 147 #include <memory>
 148 #include <string>
 149 #include "include/buffer_fwd.h"
 150
 151 class CrushWrapper;
 152
 153 namespace ceph {
 154
 155   typedef std::map<std::string,std::string> ErasureCodeProfile;
 156
 157   inline std::ostream& operator<<(std::ostream& out, const ErasureCodeProfile& profile) {
 158     out << "{";
 159     for (ErasureCodeProfile::const_iterator it = profile.begin();
 160          it != profile.end();
 161          ++it) {
 162       if (it != profile.begin()) out << ",";
 163       out << it->first << "=" << it->second;
 164     }
 165     out << "}";
 166     return out;
 167   }
 168
 169
 170   class ErasureCodeInterface {
 171   public:
 172     virtual ~ErasureCodeInterface() {}
 173
 174     /**
 175      * Initialize the instance according to the content of
 176      * **profile**. The **ss** stream is set with debug messages or
 177      * error messages, the content of which depend on the
 178      * implementation.
 179      *
 180      * Return 0 on success or a negative errno on error. When
 181      * returning on error, the implementation is expected to
 182      * provide a human readable explanation in **ss**.
 183      *
 184      * @param [in] profile a key/value map
 185      * @param [out] ss contains informative messages when an error occurs
 186      * @return 0 on success or a negative errno on error.
 187      */
 188     virtual int init(ErasureCodeProfile &profile, std::ostream *ss) = 0;
 189
 190     /**
 191      * Return the profile that was used to initialize the instance
 192      * with the **init** method.
 193      *
 194      * @return the profile in use by the instance
 195      */
 196     virtual const ErasureCodeProfile &get_profile() const = 0;
 197
 198     /**
 199      * Create a new rule in **crush** under the name **name**,
 200      * unless it already exists.
 201      *
 202      * Return the rule number that was created on success. If a
 203      * rule **name** already exists, return -EEXISTS, otherwise
 204      * return a negative value indicating an error with a semantic
 205      * defined by the implementation.
 206      *
 207      * @param [in] name of the rule to create
 208      * @param [in] crush crushmap in which the rule is created
 209      * @param [out] ss contains informative messages when an error occurs
 210      * @return a rule on success or a negative errno on error.
 211      */
 212     virtual int create_rule(const std::string &name,
 213                             CrushWrapper &crush,
 214                             std::ostream *ss) const = 0;
 215
 216     /**
 217      * Return the number of chunks created by a call to the **encode**
 218      * method.
 219      *
 220      * In the simplest case it can be K + M, i.e. the number
 221      * of data chunks (K) plus the number of parity chunks
 222      * (M). However, if the implementation provides local parity there
 223      * could be an additional overhead.
 224      *
 225      * @return the number of chunks created by encode()
 226      */
 227     virtual unsigned int get_chunk_count() const = 0;
 228
 229     /**
 230      * Return the number of data chunks created by a call to the
 231      * **encode** method. The data chunks contain the buffer provided
 232      * to **encode**, verbatim, with padding at the end of the last
 233      * chunk.
 234      *
 235      * @return the number of data chunks created by encode()
 236      */
 237     virtual unsigned int get_data_chunk_count() const = 0;
 238
 239     /**
 240      * Return the number of coding chunks created by a call to the
 241      * **encode** method. The coding chunks are used to recover from
 242      * the loss of one or more chunks. If there is one coding chunk,
 243      * it is possible to recover from the loss of exactly one
 244      * chunk. If there are two coding chunks, it is possible to
 245      * recover from the loss of at most two chunks, etc.
 246      *
 247      * @return the number of coding chunks created by encode()
 248      */
 249     virtual unsigned int get_coding_chunk_count() const = 0;
 250
 251     /**
 252      * Return the size (in bytes) of a single chunk created by a call
 253      * to the **decode** method. The returned size multiplied by
 254      * **get_chunk_count()** is greater or equal to **object_size**.
 255      *
 256      * If the object size is properly aligned, the chunk size is
 257      * **object_size / get_chunk_count()**. However, if
 258      * **object_size** is not a multiple of **get_chunk_count** or if
 259      * the implementation imposes additional alignment constraints,
 260      * the chunk size may be larger.
 261      *
 262      * The byte found at offset **B** of the original object is mapped
 263      * to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**.
 264      *
 265      * @param [in] object_size the number of bytes of the object to **encode()**
 266      * @return the size (in bytes) of a single chunk created by **encode()**
 267      */
 268     virtual unsigned int get_chunk_size(unsigned int object_size) const = 0;
 269
 270     /**
 271      * Compute the smallest subset of **available** chunks that needs
 272      * to be retrieved in order to successfully decode
 273      * **want_to_read** chunks.
 274      *
 275      * It is strictly equivalent to calling
 276      * **minimum_to_decode_with_cost** where each **available** chunk
 277      * has the same cost.
 278      *
 279      * @see minimum_to_decode_with_cost
 280      *
 281      * @param [in] want_to_read chunk indexes to be decoded
 282      * @param [in] available chunk indexes containing valid data
 283      * @param [out] minimum chunk indexes to retrieve
 284      * @return **0** on success or a negative errno on error.
 285      */
 286     virtual int minimum_to_decode(const std::set<int> &want_to_read,
 287                                   const std::set<int> &available,
 288                                   std::set<int> *minimum) = 0;
 289
 290     /**
 291      * Compute the smallest subset of **available** chunks that needs
 292      * to be retrieved in order to successfully decode
 293      * **want_to_read** chunks. If there are more than one possible
 294      * subset, select the subset that minimizes the overall retrieval
 295      * cost.
 296      *
 297      * The **available** parameter maps chunk indexes to their
 298      * retrieval cost. The higher the cost value, the more costly it
 299      * is to retrieve the chunk content.
 300      *
 301      * Returns -EIO if there are not enough chunk indexes in
 302      * **available** to decode **want_to_read**.
 303      *
 304      * Returns 0 on success.
 305      *
 306      * The **minimum** argument must be a pointer to an empty set.
 307      *
 308      * @param [in] want_to_read chunk indexes to be decoded
 309      * @param [in] available map chunk indexes containing valid data
 310      *             to their retrieval cost
 311      * @param [out] minimum chunk indexes to retrieve
 312      * @return **0** on success or a negative errno on error.
 313      */
 314     virtual int minimum_to_decode_with_cost(const std::set<int> &want_to_read,
 315                                             const std::map<int, int> &available,
 316                                             std::set<int> *minimum) = 0;
 317
 318     /**
 319      * Encode the content of **in** and store the result in
 320      * **encoded**. All buffers pointed to by **encoded** have the
 321      * same size. The **encoded** map contains at least all chunk
 322      * indexes found in the **want_to_encode** set.
 323      *
 324      * The **encoded** map is expected to be a pointer to an empty
 325      * map.
 326      *
 327      * Assuming the **in** parameter is **length** bytes long,
 328      * the concatenation of the first **length** bytes of the
 329      * **encoded** buffers is equal to the content of the **in**
 330      * parameter.
 331      *
 332      * The **encoded** map may contain more chunks than required by
 333      * **want_to_encode** and the caller is expected to permanently
 334      * store all of them, not just the chunks listed in
 335      * **want_to_encode**.
 336      *
 337      * The **encoded** map may contain pointers to data stored in
 338      * the **in** parameter. If the caller modifies the content of
 339      * **in** after calling the encode method, it may have a side
 340      * effect on the content of **encoded**.
 341      *
 342      * The **encoded** map may contain pointers to buffers allocated
 343      * by the encode method. They will be freed when **encoded** is
 344      * freed. The allocation method is not specified.
 345      *
 346      * Returns 0 on success.
 347      *
 348      * @param [in] want_to_encode chunk indexes to be encoded
 349      * @param [in] in data to be encoded
 350      * @param [out] encoded map chunk indexes to chunk data
 351      * @return **0** on success or a negative errno on error.
 352      */
 353     virtual int encode(const std::set<int> &want_to_encode,
 354                        const bufferlist &in,
 355                        std::map<int, bufferlist> *encoded) = 0;
 356
 357
 358     virtual int encode_chunks(const std::set<int> &want_to_encode,
 359                               std::map<int, bufferlist> *encoded) = 0;
 360
 361     /**
 362      * Decode the **chunks** and store at least **want_to_read**
 363      * chunks in **decoded**.
 364      *
 365      * The **decoded** map must be a pointer to an empty map.
 366      *
 367      * There must be enough **chunks** ( as returned by
 368      * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
 369      * perform a successful decoding of all chunks listed in
 370      * **want_to_read**.
 371      *
 372      * All buffers pointed by **in** must have the same size.
 373      *
 374      * On success, the **decoded** map may contain more chunks than
 375      * required by **want_to_read** and they can safely be used by the
 376      * caller.
 377      *
 378      * If a chunk is listed in **want_to_read** and there is a
 379      * corresponding **bufferlist** in **chunks**, it will be
 380      * referenced in **decoded**. If not it will be reconstructed from
 381      * the existing chunks.
 382      *
 383      * Because **decoded** may contain pointers to data found in
 384      * **chunks**, modifying the content of **chunks** after calling
 385      * decode may have a side effect on the content of **decoded**.
 386      *
 387      * Returns 0 on success.
 388      *
 389      * @param [in] want_to_read chunk indexes to be decoded
 390      * @param [in] chunks map chunk indexes to chunk data
 391      * @param [out] decoded map chunk indexes to chunk data
 392      * @return **0** on success or a negative errno on error.
 393      */
 394     virtual int decode(const std::set<int> &want_to_read,
 395                        const std::map<int, bufferlist> &chunks,
 396                        std::map<int, bufferlist> *decoded) = 0;
 397
 398     virtual int decode_chunks(const std::set<int> &want_to_read,
 399                               const std::map<int, bufferlist> &chunks,
 400                               std::map<int, bufferlist> *decoded) = 0;
 401
 402     /**
 403      * Return the ordered list of chunks or an empty vector
 404      * if no remapping is necessary.
 405      *
 406      * By default encoding an object with K=2,M=1 will create three
 407      * chunks, the first two are data and the last one coding. For
 408      * a 10MB object, it would be:
 409      *
 410      *   chunk 0 for the first 5MB
 411      *   chunk 1 for the last 5MB
 412      *   chunk 2 for the 5MB coding chunk
 413      *
 414      * The plugin may, however, decide to remap them in a different
 415      * order, such as:
 416      *
 417      *   chunk 0 for the last 5MB
 418      *   chunk 1 for the 5MB coding chunk
 419      *   chunk 2 for the first 5MB
 420      *
 421      * The vector<int> remaps the chunks so that the first chunks are
 422      * data, in sequential order, and the last chunks contain parity
 423      * in the same order as they were output by the encoding function.
 424      *
 425      * In the example above the mapping would be:
 426      *
 427      *   [ 1, 2, 0 ]
 428      *
 429      * The returned vector<int> only contains information for chunks
 430      * that need remapping. If no remapping is necessary, the
 431      * vector<int> is empty.
 432      *
 433      * @return vector<int> list of indices of chunks to be remapped
 434      */
 435     virtual const std::vector<int> &get_chunk_mapping() const = 0;
 436
 437     /**
 438      * Decode the first **get_data_chunk_count()** **chunks** and
 439      * concatenate them into **decoded**.
 440      *
 441      * Returns 0 on success.
 442      *
 443      * @param [in] chunks map chunk indexes to chunk data
 444      * @param [out] decoded concatenante of the data chunks
 445      * @return **0** on success or a negative errno on error.
 446      */
 447     virtual int decode_concat(const std::map<int, bufferlist> &chunks,
 448                               bufferlist *decoded) = 0;
 449   };
 450
 451   typedef std::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
 452
 453 }
 454
 455 #endif