]> git.proxmox.com Git - ceph.git/blame - ceph/src/erasure-code/ErasureCodeInterface.h
bump version to 12.0.3-pve3
[ceph.git] / ceph / src / erasure-code / ErasureCodeInterface.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph distributed storage system
5 *
6 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
7 *
8 * Author: Loic Dachary <loic@dachary.org>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 */
16
17#ifndef CEPH_ERASURE_CODE_INTERFACE_H
18#define CEPH_ERASURE_CODE_INTERFACE_H
19
20/*! @file ErasureCodeInterface.h
21 @brief Interface provided by erasure code plugins
22
23 The erasure coded pools rely on plugins implementing
24 **ErasureCodeInterface** to encode and decode content. All codes
25 are systematic (i.e. the data is not mangled and can be
26 reconstructed by concatenating chunks ).
27
28 Methods returning an **int** return **0** on success and a
29 negative value on error. If the value returned on error is not
30 explained in **ErasureCodeInterface**, the sources or the
31 documentation of the interface implementer (i.e. the plugin ) must
32 be read to figure out what it means. It is recommended that each
33 error code matches an *errno* value that relates to the cause of
34 the error.
35
36 If an object is small enough, the caller can process it with
37 one call to the **encode** or **decode** method.
38
39 +---------------- coded object O -------------------------+
40 |+----------------+ +----------------+ +----------------+ |
41 || chunk 0 | | chunk 1 | | chunk 2 | |
42 || [0,N) | | [N,2N) | | [2N,3N) | |
43 |+----------------+ +----------------+ +----------------+ |
44 +------^--------------------------------------------------+
45 |
46 chunk B / C | offset B % C ( where C is the chunk size )
47 |
48 +-----^---- raw object O ----+------+
49 | B [0,X) | pad |
50 +----------------------------+------+
51
52 The object size is paded so that each chunks are of the same size.
53 In the example above, if the actual object size was X, then it
54 will be padded to 2N >= X assuming there are two data chunks (0
55 and 1) and one coding chunk (2).
56
57 For chunks of size C, byte B of the object is found in chunk number
58 B / C at offset B % C.
59
60 If an object is too large to be encoded in memory, the caller
61 should divide it in smaller units named **stripes**.
62
63 +---------------------- object O -------------------------+
64 |+----------------+ +----------------+ +----------------+ |
65 stripe || chunk 0 | | chunk 1 | | chunk 2 | |
66 0 || [0,N) | | [N,2N) | | [2N,3N) | |
67 |+----------------+ +----------------+ +----------------+ |
68 |+----------------+ +----------------+ +----------------+ |
69 stripe || chunk 0 | | chunk 1 | | chunk 2 | |
70 1 || [X,M) | | [X+M,X+2M) | | [X+2M,X+3M) | |
71 || | | | | | |
72 |+----------------+ +----------------+ +----------------+ |
73 | ... |
74 +---------------------------------------------------------+
75
76 The interface does not concern itself with stripes nor does it
77 impose constraints on the size of each stripe. Variable names in
78 the interface always use **object** and never use **stripe**.
79
80 Assuming the interface implementer provides three data chunks ( K
81 = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
82 follows:
83
84 ~~~~~~~~~~~~~~~~{.c}
85 set<int> want_to_encode(0, 1, 2, // data chunks
86 3, 4 // coding chunks
87 );
88 bufferlist in = "ABCDEF";
89 map<int, bufferlist> encoded
90 encode(want_to_encode, in, &encoded);
91 encoded[0] == "AB" // data chunk 0
92 encoded[1] == "CD" // data chunk 1
93 encoded[2] == "EF" // data chunk 2
94 encoded[3] // coding chunk 0
95 encoded[4] // coding chunk 1
96 ~~~~~~~~~~~~~~~~
97
98 The **minimum_to_decode_with_cost** method can be used to minimize
99 the cost of fetching the chunks necessary to retrieve a given
100 content. For instance, if encoded[2] (contained **EF**) is missing
101 and accessing encoded[3] (the first coding chunk) is more
102 expensive than accessing encoded[4] (the second coding chunk),
103 **minimum_to_decode_with_cost** is expected to chose the first
104 coding chunk.
105
106 ~~~~~~~~~~~~~~~~{.c}
107 set<int> want_to_read(2); // want the chunk containing "EF"
108 map<int,int> available(
109 0 => 1, // data chunk 0 : available and costs 1
110 1 => 1, // data chunk 1 : available and costs 1
111 // data chunk 2 : missing
112 3 => 9, // coding chunk 1 : available and costs 9
113 4 => 1, // coding chunk 2 : available and costs 1
114 );
115 set<int> minimum;
116 minimum_to_decode_with_cost(want_to_read,
117 available,
118 &minimum);
119 minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
120 ~~~~~~~~~~~~~~~~
121
122 It sets **minimum** with three chunks to reconstruct the desired
123 data chunk and will pick the second coding chunk ( 4 ) because it
124 is less expensive ( 1 < 9 ) to retrieve than the first coding
125 chunk ( 3 ). The caller is responsible for retrieving the chunks
126 and call **decode** to reconstruct the second data chunk.
127
128 ~~~~~~~~~~~~~~~~{.c}
129 map<int,bufferlist> chunks;
130 for i in minimum.keys():
131 chunks[i] = fetch_chunk(i); // get chunk from storage
132 map<int, bufferlist> decoded;
133 decode(want_to_read, chunks, &decoded);
134 decoded[2] == "EF"
135 ~~~~~~~~~~~~~~~~
136
137 The semantic of the cost value is defined by the caller and must
138 be known to the implementer. For instance, it may be more
139 expensive to retrieve two chunks with cost 1 + 9 = 10 than two
140 chunks with cost 6 + 6 = 12.
141 */
142
143#include <map>
144#include <set>
145#include <vector>
146#include <iostream>
147#include "include/memory.h"
148#include "include/buffer_fwd.h"
149
150class CrushWrapper;
151
152using namespace std;
153
154namespace ceph {
155
156 typedef map<std::string,std::string> ErasureCodeProfile;
157
158 inline ostream& operator<<(ostream& out, const ErasureCodeProfile& profile) {
159 out << "{";
160 for (ErasureCodeProfile::const_iterator it = profile.begin();
161 it != profile.end();
162 ++it) {
163 if (it != profile.begin()) out << ",";
164 out << it->first << "=" << it->second;
165 }
166 out << "}";
167 return out;
168 }
169
170
171 class ErasureCodeInterface {
172 public:
173 virtual ~ErasureCodeInterface() {}
174
175 /**
176 * Initialize the instance according to the content of
177 * **profile**. The **ss** stream is set with debug messages or
178 * error messages, the content of which depend on the
179 * implementation.
180 *
181 * Return 0 on success or a negative errno on error. When
182 * returning on error, the implementation is expected to
183 * provide a human readable explanation in **ss**.
184 *
185 * @param [in] profile a key/value map
186 * @param [out] ss contains informative messages when an error occurs
187 * @return 0 on success or a negative errno on error.
188 */
189 virtual int init(ErasureCodeProfile &profile, ostream *ss) = 0;
190
191 /**
192 * Return the profile that was used to initialize the instance
193 * with the **init** method.
194 *
195 * @return the profile in use by the instance
196 */
197 virtual const ErasureCodeProfile &get_profile() const = 0;
198
199 /**
200 * Create a new ruleset in **crush** under the name **name**,
201 * unless it already exists.
202 *
203 * Return the ruleset number that was created on success. If a
204 * ruleset **name** already exists, return -EEXISTS, otherwise
205 * return a negative value indicating an error with a semantic
206 * defined by the implementation.
207 *
208 * @param [in] name of the ruleset to create
209 * @param [in] crush crushmap in which the ruleset is created
210 * @param [out] ss contains informative messages when an error occurs
211 * @return a ruleset on success or a negative errno on error.
212 */
213 virtual int create_ruleset(const string &name,
214 CrushWrapper &crush,
215 ostream *ss) const = 0;
216
217 /**
218 * Return the number of chunks created by a call to the **encode**
219 * method.
220 *
221 * In the simplest case it can be K + M, i.e. the number
222 * of data chunks (K) plus the number of parity chunks
223 * (M). However, if the implementation provides local parity there
224 * could be an additional overhead.
225 *
226 * @return the number of chunks created by encode()
227 */
228 virtual unsigned int get_chunk_count() const = 0;
229
230 /**
231 * Return the number of data chunks created by a call to the
232 * **encode** method. The data chunks contain the buffer provided
233 * to **encode**, verbatim, with padding at the end of the last
234 * chunk.
235 *
236 * @return the number of data chunks created by encode()
237 */
238 virtual unsigned int get_data_chunk_count() const = 0;
239
240 /**
241 * Return the number of coding chunks created by a call to the
242 * **encode** method. The coding chunks are used to recover from
243 * the loss of one or more chunks. If there is one coding chunk,
244 * it is possible to recover from the loss of exactly one
245 * chunk. If there are two coding chunks, it is possible to
246 * recover from the loss of at most two chunks, etc.
247 *
248 * @return the number of coding chunks created by encode()
249 */
250 virtual unsigned int get_coding_chunk_count() const = 0;
251
252 /**
253 * Return the size (in bytes) of a single chunk created by a call
254 * to the **decode** method. The returned size multiplied by
255 * **get_chunk_count()** is greater or equal to **object_size**.
256 *
257 * If the object size is properly aligned, the chunk size is
258 * **object_size / get_chunk_count()**. However, if
259 * **object_size** is not a multiple of **get_chunk_count** or if
260 * the implementation imposes additional alignment constraints,
261 * the chunk size may be larger.
262 *
263 * The byte found at offset **B** of the original object is mapped
264 * to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**.
265 *
266 * @param [in] object_size the number of bytes of the object to **encode()**
267 * @return the size (in bytes) of a single chunk created by **encode()**
268 */
269 virtual unsigned int get_chunk_size(unsigned int object_size) const = 0;
270
271 /**
272 * Compute the smallest subset of **available** chunks that needs
273 * to be retrieved in order to successfully decode
274 * **want_to_read** chunks.
275 *
276 * It is strictly equivalent to calling
277 * **minimum_to_decode_with_cost** where each **available** chunk
278 * has the same cost.
279 *
280 * @see minimum_to_decode_with_cost
281 *
282 * @param [in] want_to_read chunk indexes to be decoded
283 * @param [in] available chunk indexes containing valid data
284 * @param [out] minimum chunk indexes to retrieve
285 * @return **0** on success or a negative errno on error.
286 */
287 virtual int minimum_to_decode(const set<int> &want_to_read,
288 const set<int> &available,
289 set<int> *minimum) = 0;
290
291 /**
292 * Compute the smallest subset of **available** chunks that needs
293 * to be retrieved in order to successfully decode
294 * **want_to_read** chunks. If there are more than one possible
295 * subset, select the subset that minimizes the overall retrieval
296 * cost.
297 *
298 * The **available** parameter maps chunk indexes to their
299 * retrieval cost. The higher the cost value, the more costly it
300 * is to retrieve the chunk content.
301 *
302 * Returns -EIO if there are not enough chunk indexes in
303 * **available** to decode **want_to_read**.
304 *
305 * Returns 0 on success.
306 *
307 * The **minimum** argument must be a pointer to an empty set.
308 *
309 * @param [in] want_to_read chunk indexes to be decoded
310 * @param [in] available map chunk indexes containing valid data
311 * to their retrieval cost
312 * @param [out] minimum chunk indexes to retrieve
313 * @return **0** on success or a negative errno on error.
314 */
315 virtual int minimum_to_decode_with_cost(const set<int> &want_to_read,
316 const map<int, int> &available,
317 set<int> *minimum) = 0;
318
319 /**
320 * Encode the content of **in** and store the result in
321 * **encoded**. All buffers pointed to by **encoded** have the
322 * same size. The **encoded** map contains at least all chunk
323 * indexes found in the **want_to_encode** set.
324 *
325 * The **encoded** map is expected to be a pointer to an empty
326 * map.
327 *
328 * Assuming the **in** parameter is **length** bytes long,
329 * the concatenation of the first **length** bytes of the
330 * **encoded** buffers is equal to the content of the **in**
331 * parameter.
332 *
333 * The **encoded** map may contain more chunks than required by
334 * **want_to_encode** and the caller is expected to permanently
335 * store all of them, not just the chunks listed in
336 * **want_to_encode**.
337 *
338 * The **encoded** map may contain pointers to data stored in
339 * the **in** parameter. If the caller modifies the content of
340 * **in** after calling the encode method, it may have a side
341 * effect on the content of **encoded**.
342 *
343 * The **encoded** map may contain pointers to buffers allocated
344 * by the encode method. They will be freed when **encoded** is
345 * freed. The allocation method is not specified.
346 *
347 * Returns 0 on success.
348 *
349 * @param [in] want_to_encode chunk indexes to be encoded
350 * @param [in] in data to be encoded
351 * @param [out] encoded map chunk indexes to chunk data
352 * @return **0** on success or a negative errno on error.
353 */
354 virtual int encode(const set<int> &want_to_encode,
355 const bufferlist &in,
356 map<int, bufferlist> *encoded) = 0;
357
358
359 virtual int encode_chunks(const set<int> &want_to_encode,
360 map<int, bufferlist> *encoded) = 0;
361
362 /**
363 * Decode the **chunks** and store at least **want_to_read**
364 * chunks in **decoded**.
365 *
366 * The **decoded** map must be a pointer to an empty map.
367 *
368 * There must be enough **chunks** ( as returned by
369 * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
370 * perform a successful decoding of all chunks listed in
371 * **want_to_read**.
372 *
373 * All buffers pointed by **in** must have the same size.
374 *
375 * On success, the **decoded** map may contain more chunks than
376 * required by **want_to_read** and they can safely be used by the
377 * caller.
378 *
379 * If a chunk is listed in **want_to_read** and there is a
380 * corresponding **bufferlist** in **chunks**, it will be
381 * referenced in **decoded**. If not it will be reconstructed from
382 * the existing chunks.
383 *
384 * Because **decoded** may contain pointers to data found in
385 * **chunks**, modifying the content of **chunks** after calling
386 * decode may have a side effect on the content of **decoded**.
387 *
388 * Returns 0 on success.
389 *
390 * @param [in] want_to_read chunk indexes to be decoded
391 * @param [in] chunks map chunk indexes to chunk data
392 * @param [out] decoded map chunk indexes to chunk data
393 * @return **0** on success or a negative errno on error.
394 */
395 virtual int decode(const set<int> &want_to_read,
396 const map<int, bufferlist> &chunks,
397 map<int, bufferlist> *decoded) = 0;
398
399 virtual int decode_chunks(const set<int> &want_to_read,
400 const map<int, bufferlist> &chunks,
401 map<int, bufferlist> *decoded) = 0;
402
403 /**
404 * Return the ordered list of chunks or an empty vector
405 * if no remapping is necessary.
406 *
407 * By default encoding an object with K=2,M=1 will create three
408 * chunks, the first two are data and the last one coding. For
409 * a 10MB object, it would be:
410 *
411 * chunk 0 for the first 5MB
412 * chunk 1 for the last 5MB
413 * chunk 2 for the 5MB coding chunk
414 *
415 * The plugin may, however, decide to remap them in a different
416 * order, such as:
417 *
418 * chunk 0 for the last 5MB
419 * chunk 1 for the 5MB coding chunk
420 * chunk 2 for the first 5MB
421 *
422 * The vector<int> remaps the chunks so that the first chunks are
423 * data, in sequential order, and the last chunks contain parity
424 * in the same order as they were output by the encoding function.
425 *
426 * In the example above the mapping would be:
427 *
428 * [ 1, 2, 0 ]
429 *
430 * The returned vector<int> only contains information for chunks
431 * that need remapping. If no remapping is necessary, the
432 * vector<int> is empty.
433 *
434 * @return vector<int> list of indices of chunks to be remapped
435 */
436 virtual const vector<int> &get_chunk_mapping() const = 0;
437
438 /**
439 * Decode the first **get_data_chunk_count()** **chunks** and
440 * concatenate them into **decoded**.
441 *
442 * Returns 0 on success.
443 *
444 * @param [in] chunks map chunk indexes to chunk data
445 * @param [out] decoded concatenante of the data chunks
446 * @return **0** on success or a negative errno on error.
447 */
448 virtual int decode_concat(const map<int, bufferlist> &chunks,
449 bufferlist *decoded) = 0;
450 };
451
452 typedef ceph::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
453
454}
455
456#endif