]> git.proxmox.com Git - ceph.git/blob - ceph/src/erasure-code/ErasureCodeInterface.h
update sources to v12.1.1
[ceph.git] / ceph / src / erasure-code / ErasureCodeInterface.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph distributed storage system
5 *
6 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
7 *
8 * Author: Loic Dachary <loic@dachary.org>
9 *
10 * This library is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 */
16
17 #ifndef CEPH_ERASURE_CODE_INTERFACE_H
18 #define CEPH_ERASURE_CODE_INTERFACE_H
19
20 /*! @file ErasureCodeInterface.h
21 @brief Interface provided by erasure code plugins
22
23 The erasure coded pools rely on plugins implementing
24 **ErasureCodeInterface** to encode and decode content. All codes
25 are systematic (i.e. the data is not mangled and can be
26 reconstructed by concatenating chunks ).
27
28 Methods returning an **int** return **0** on success and a
29 negative value on error. If the value returned on error is not
30 explained in **ErasureCodeInterface**, the sources or the
31 documentation of the interface implementer (i.e. the plugin ) must
32 be read to figure out what it means. It is recommended that each
33 error code matches an *errno* value that relates to the cause of
34 the error.
35
36 If an object is small enough, the caller can process it with
37 one call to the **encode** or **decode** method.
38
39 +---------------- coded object O -------------------------+
40 |+----------------+ +----------------+ +----------------+ |
41 || chunk 0 | | chunk 1 | | chunk 2 | |
42 || [0,N) | | [N,2N) | | [2N,3N) | |
43 |+----------------+ +----------------+ +----------------+ |
44 +------^--------------------------------------------------+
45 |
46 chunk B / C | offset B % C ( where C is the chunk size )
47 |
48 +-----^---- raw object O ----+------+
49 | B [0,X) | pad |
50 +----------------------------+------+
51
52 The object size is paded so that each chunks are of the same size.
53 In the example above, if the actual object size was X, then it
54 will be padded to 2N >= X assuming there are two data chunks (0
55 and 1) and one coding chunk (2).
56
57 For chunks of size C, byte B of the object is found in chunk number
58 B / C at offset B % C.
59
60 If an object is too large to be encoded in memory, the caller
61 should divide it in smaller units named **stripes**.
62
63 +---------------------- object O -------------------------+
64 |+----------------+ +----------------+ +----------------+ |
65 stripe || chunk 0 | | chunk 1 | | chunk 2 | |
66 0 || [0,N) | | [N,2N) | | [2N,3N) | |
67 |+----------------+ +----------------+ +----------------+ |
68 |+----------------+ +----------------+ +----------------+ |
69 stripe || chunk 0 | | chunk 1 | | chunk 2 | |
70 1 || [X,M) | | [X+M,X+2M) | | [X+2M,X+3M) | |
71 || | | | | | |
72 |+----------------+ +----------------+ +----------------+ |
73 | ... |
74 +---------------------------------------------------------+
75
76 The interface does not concern itself with stripes nor does it
77 impose constraints on the size of each stripe. Variable names in
78 the interface always use **object** and never use **stripe**.
79
80 Assuming the interface implementer provides three data chunks ( K
81 = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as
82 follows:
83
84 ~~~~~~~~~~~~~~~~{.c}
85 set<int> want_to_encode(0, 1, 2, // data chunks
86 3, 4 // coding chunks
87 );
88 bufferlist in = "ABCDEF";
89 map<int, bufferlist> encoded
90 encode(want_to_encode, in, &encoded);
91 encoded[0] == "AB" // data chunk 0
92 encoded[1] == "CD" // data chunk 1
93 encoded[2] == "EF" // data chunk 2
94 encoded[3] // coding chunk 0
95 encoded[4] // coding chunk 1
96 ~~~~~~~~~~~~~~~~
97
98 The **minimum_to_decode_with_cost** method can be used to minimize
99 the cost of fetching the chunks necessary to retrieve a given
100 content. For instance, if encoded[2] (contained **EF**) is missing
101 and accessing encoded[3] (the first coding chunk) is more
102 expensive than accessing encoded[4] (the second coding chunk),
103 **minimum_to_decode_with_cost** is expected to chose the first
104 coding chunk.
105
106 ~~~~~~~~~~~~~~~~{.c}
107 set<int> want_to_read(2); // want the chunk containing "EF"
108 map<int,int> available(
109 0 => 1, // data chunk 0 : available and costs 1
110 1 => 1, // data chunk 1 : available and costs 1
111 // data chunk 2 : missing
112 3 => 9, // coding chunk 1 : available and costs 9
113 4 => 1, // coding chunk 2 : available and costs 1
114 );
115 set<int> minimum;
116 minimum_to_decode_with_cost(want_to_read,
117 available,
118 &minimum);
119 minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3);
120 ~~~~~~~~~~~~~~~~
121
122 It sets **minimum** with three chunks to reconstruct the desired
123 data chunk and will pick the second coding chunk ( 4 ) because it
124 is less expensive ( 1 < 9 ) to retrieve than the first coding
125 chunk ( 3 ). The caller is responsible for retrieving the chunks
126 and call **decode** to reconstruct the second data chunk.
127
128 ~~~~~~~~~~~~~~~~{.c}
129 map<int,bufferlist> chunks;
130 for i in minimum.keys():
131 chunks[i] = fetch_chunk(i); // get chunk from storage
132 map<int, bufferlist> decoded;
133 decode(want_to_read, chunks, &decoded);
134 decoded[2] == "EF"
135 ~~~~~~~~~~~~~~~~
136
137 The semantic of the cost value is defined by the caller and must
138 be known to the implementer. For instance, it may be more
139 expensive to retrieve two chunks with cost 1 + 9 = 10 than two
140 chunks with cost 6 + 6 = 12.
141 */
142
143 #include <map>
144 #include <set>
145 #include <vector>
146 #include <ostream>
147 #include <memory>
148 #include <string>
149 #include "include/buffer_fwd.h"
150
151 class CrushWrapper;
152
153 namespace ceph {
154
155 typedef std::map<std::string,std::string> ErasureCodeProfile;
156
157 inline std::ostream& operator<<(std::ostream& out, const ErasureCodeProfile& profile) {
158 out << "{";
159 for (ErasureCodeProfile::const_iterator it = profile.begin();
160 it != profile.end();
161 ++it) {
162 if (it != profile.begin()) out << ",";
163 out << it->first << "=" << it->second;
164 }
165 out << "}";
166 return out;
167 }
168
169
170 class ErasureCodeInterface {
171 public:
172 virtual ~ErasureCodeInterface() {}
173
174 /**
175 * Initialize the instance according to the content of
176 * **profile**. The **ss** stream is set with debug messages or
177 * error messages, the content of which depend on the
178 * implementation.
179 *
180 * Return 0 on success or a negative errno on error. When
181 * returning on error, the implementation is expected to
182 * provide a human readable explanation in **ss**.
183 *
184 * @param [in] profile a key/value map
185 * @param [out] ss contains informative messages when an error occurs
186 * @return 0 on success or a negative errno on error.
187 */
188 virtual int init(ErasureCodeProfile &profile, std::ostream *ss) = 0;
189
190 /**
191 * Return the profile that was used to initialize the instance
192 * with the **init** method.
193 *
194 * @return the profile in use by the instance
195 */
196 virtual const ErasureCodeProfile &get_profile() const = 0;
197
198 /**
199 * Create a new rule in **crush** under the name **name**,
200 * unless it already exists.
201 *
202 * Return the rule number that was created on success. If a
203 * rule **name** already exists, return -EEXISTS, otherwise
204 * return a negative value indicating an error with a semantic
205 * defined by the implementation.
206 *
207 * @param [in] name of the rule to create
208 * @param [in] crush crushmap in which the rule is created
209 * @param [out] ss contains informative messages when an error occurs
210 * @return a rule on success or a negative errno on error.
211 */
212 virtual int create_rule(const std::string &name,
213 CrushWrapper &crush,
214 std::ostream *ss) const = 0;
215
216 /**
217 * Return the number of chunks created by a call to the **encode**
218 * method.
219 *
220 * In the simplest case it can be K + M, i.e. the number
221 * of data chunks (K) plus the number of parity chunks
222 * (M). However, if the implementation provides local parity there
223 * could be an additional overhead.
224 *
225 * @return the number of chunks created by encode()
226 */
227 virtual unsigned int get_chunk_count() const = 0;
228
229 /**
230 * Return the number of data chunks created by a call to the
231 * **encode** method. The data chunks contain the buffer provided
232 * to **encode**, verbatim, with padding at the end of the last
233 * chunk.
234 *
235 * @return the number of data chunks created by encode()
236 */
237 virtual unsigned int get_data_chunk_count() const = 0;
238
239 /**
240 * Return the number of coding chunks created by a call to the
241 * **encode** method. The coding chunks are used to recover from
242 * the loss of one or more chunks. If there is one coding chunk,
243 * it is possible to recover from the loss of exactly one
244 * chunk. If there are two coding chunks, it is possible to
245 * recover from the loss of at most two chunks, etc.
246 *
247 * @return the number of coding chunks created by encode()
248 */
249 virtual unsigned int get_coding_chunk_count() const = 0;
250
251 /**
252 * Return the size (in bytes) of a single chunk created by a call
253 * to the **decode** method. The returned size multiplied by
254 * **get_chunk_count()** is greater or equal to **object_size**.
255 *
256 * If the object size is properly aligned, the chunk size is
257 * **object_size / get_chunk_count()**. However, if
258 * **object_size** is not a multiple of **get_chunk_count** or if
259 * the implementation imposes additional alignment constraints,
260 * the chunk size may be larger.
261 *
262 * The byte found at offset **B** of the original object is mapped
263 * to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**.
264 *
265 * @param [in] object_size the number of bytes of the object to **encode()**
266 * @return the size (in bytes) of a single chunk created by **encode()**
267 */
268 virtual unsigned int get_chunk_size(unsigned int object_size) const = 0;
269
270 /**
271 * Compute the smallest subset of **available** chunks that needs
272 * to be retrieved in order to successfully decode
273 * **want_to_read** chunks.
274 *
275 * It is strictly equivalent to calling
276 * **minimum_to_decode_with_cost** where each **available** chunk
277 * has the same cost.
278 *
279 * @see minimum_to_decode_with_cost
280 *
281 * @param [in] want_to_read chunk indexes to be decoded
282 * @param [in] available chunk indexes containing valid data
283 * @param [out] minimum chunk indexes to retrieve
284 * @return **0** on success or a negative errno on error.
285 */
286 virtual int minimum_to_decode(const std::set<int> &want_to_read,
287 const std::set<int> &available,
288 std::set<int> *minimum) = 0;
289
290 /**
291 * Compute the smallest subset of **available** chunks that needs
292 * to be retrieved in order to successfully decode
293 * **want_to_read** chunks. If there are more than one possible
294 * subset, select the subset that minimizes the overall retrieval
295 * cost.
296 *
297 * The **available** parameter maps chunk indexes to their
298 * retrieval cost. The higher the cost value, the more costly it
299 * is to retrieve the chunk content.
300 *
301 * Returns -EIO if there are not enough chunk indexes in
302 * **available** to decode **want_to_read**.
303 *
304 * Returns 0 on success.
305 *
306 * The **minimum** argument must be a pointer to an empty set.
307 *
308 * @param [in] want_to_read chunk indexes to be decoded
309 * @param [in] available map chunk indexes containing valid data
310 * to their retrieval cost
311 * @param [out] minimum chunk indexes to retrieve
312 * @return **0** on success or a negative errno on error.
313 */
314 virtual int minimum_to_decode_with_cost(const std::set<int> &want_to_read,
315 const std::map<int, int> &available,
316 std::set<int> *minimum) = 0;
317
318 /**
319 * Encode the content of **in** and store the result in
320 * **encoded**. All buffers pointed to by **encoded** have the
321 * same size. The **encoded** map contains at least all chunk
322 * indexes found in the **want_to_encode** set.
323 *
324 * The **encoded** map is expected to be a pointer to an empty
325 * map.
326 *
327 * Assuming the **in** parameter is **length** bytes long,
328 * the concatenation of the first **length** bytes of the
329 * **encoded** buffers is equal to the content of the **in**
330 * parameter.
331 *
332 * The **encoded** map may contain more chunks than required by
333 * **want_to_encode** and the caller is expected to permanently
334 * store all of them, not just the chunks listed in
335 * **want_to_encode**.
336 *
337 * The **encoded** map may contain pointers to data stored in
338 * the **in** parameter. If the caller modifies the content of
339 * **in** after calling the encode method, it may have a side
340 * effect on the content of **encoded**.
341 *
342 * The **encoded** map may contain pointers to buffers allocated
343 * by the encode method. They will be freed when **encoded** is
344 * freed. The allocation method is not specified.
345 *
346 * Returns 0 on success.
347 *
348 * @param [in] want_to_encode chunk indexes to be encoded
349 * @param [in] in data to be encoded
350 * @param [out] encoded map chunk indexes to chunk data
351 * @return **0** on success or a negative errno on error.
352 */
353 virtual int encode(const std::set<int> &want_to_encode,
354 const bufferlist &in,
355 std::map<int, bufferlist> *encoded) = 0;
356
357
358 virtual int encode_chunks(const std::set<int> &want_to_encode,
359 std::map<int, bufferlist> *encoded) = 0;
360
361 /**
362 * Decode the **chunks** and store at least **want_to_read**
363 * chunks in **decoded**.
364 *
365 * The **decoded** map must be a pointer to an empty map.
366 *
367 * There must be enough **chunks** ( as returned by
368 * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to
369 * perform a successful decoding of all chunks listed in
370 * **want_to_read**.
371 *
372 * All buffers pointed by **in** must have the same size.
373 *
374 * On success, the **decoded** map may contain more chunks than
375 * required by **want_to_read** and they can safely be used by the
376 * caller.
377 *
378 * If a chunk is listed in **want_to_read** and there is a
379 * corresponding **bufferlist** in **chunks**, it will be
380 * referenced in **decoded**. If not it will be reconstructed from
381 * the existing chunks.
382 *
383 * Because **decoded** may contain pointers to data found in
384 * **chunks**, modifying the content of **chunks** after calling
385 * decode may have a side effect on the content of **decoded**.
386 *
387 * Returns 0 on success.
388 *
389 * @param [in] want_to_read chunk indexes to be decoded
390 * @param [in] chunks map chunk indexes to chunk data
391 * @param [out] decoded map chunk indexes to chunk data
392 * @return **0** on success or a negative errno on error.
393 */
394 virtual int decode(const std::set<int> &want_to_read,
395 const std::map<int, bufferlist> &chunks,
396 std::map<int, bufferlist> *decoded) = 0;
397
398 virtual int decode_chunks(const std::set<int> &want_to_read,
399 const std::map<int, bufferlist> &chunks,
400 std::map<int, bufferlist> *decoded) = 0;
401
402 /**
403 * Return the ordered list of chunks or an empty vector
404 * if no remapping is necessary.
405 *
406 * By default encoding an object with K=2,M=1 will create three
407 * chunks, the first two are data and the last one coding. For
408 * a 10MB object, it would be:
409 *
410 * chunk 0 for the first 5MB
411 * chunk 1 for the last 5MB
412 * chunk 2 for the 5MB coding chunk
413 *
414 * The plugin may, however, decide to remap them in a different
415 * order, such as:
416 *
417 * chunk 0 for the last 5MB
418 * chunk 1 for the 5MB coding chunk
419 * chunk 2 for the first 5MB
420 *
421 * The vector<int> remaps the chunks so that the first chunks are
422 * data, in sequential order, and the last chunks contain parity
423 * in the same order as they were output by the encoding function.
424 *
425 * In the example above the mapping would be:
426 *
427 * [ 1, 2, 0 ]
428 *
429 * The returned vector<int> only contains information for chunks
430 * that need remapping. If no remapping is necessary, the
431 * vector<int> is empty.
432 *
433 * @return vector<int> list of indices of chunks to be remapped
434 */
435 virtual const std::vector<int> &get_chunk_mapping() const = 0;
436
437 /**
438 * Decode the first **get_data_chunk_count()** **chunks** and
439 * concatenate them into **decoded**.
440 *
441 * Returns 0 on success.
442 *
443 * @param [in] chunks map chunk indexes to chunk data
444 * @param [out] decoded concatenante of the data chunks
445 * @return **0** on success or a negative errno on error.
446 */
447 virtual int decode_concat(const std::map<int, bufferlist> &chunks,
448 bufferlist *decoded) = 0;
449 };
450
451 typedef std::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef;
452
453 }
454
455 #endif