]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph distributed storage system | |
5 | * | |
6 | * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com> | |
7 | * | |
8 | * Author: Loic Dachary <loic@dachary.org> | |
9 | * | |
10 | * This library is free software; you can redistribute it and/or | |
11 | * modify it under the terms of the GNU Lesser General Public | |
12 | * License as published by the Free Software Foundation; either | |
13 | * version 2.1 of the License, or (at your option) any later version. | |
14 | * | |
15 | */ | |
16 | ||
17 | #ifndef CEPH_ERASURE_CODE_INTERFACE_H | |
18 | #define CEPH_ERASURE_CODE_INTERFACE_H | |
19 | ||
20 | /*! @file ErasureCodeInterface.h | |
21 | @brief Interface provided by erasure code plugins | |
22 | ||
23 | The erasure coded pools rely on plugins implementing | |
24 | **ErasureCodeInterface** to encode and decode content. All codes | |
25 | are systematic (i.e. the data is not mangled and can be | |
26 | reconstructed by concatenating chunks ). | |
27 | ||
28 | Methods returning an **int** return **0** on success and a | |
29 | negative value on error. If the value returned on error is not | |
30 | explained in **ErasureCodeInterface**, the sources or the | |
31 | documentation of the interface implementer (i.e. the plugin ) must | |
32 | be read to figure out what it means. It is recommended that each | |
33 | error code matches an *errno* value that relates to the cause of | |
34 | the error. | |
35 | ||
36 | If an object is small enough, the caller can process it with | |
37 | one call to the **encode** or **decode** method. | |
38 | ||
39 | +---------------- coded object O -------------------------+ | |
40 | |+----------------+ +----------------+ +----------------+ | | |
41 | || chunk 0 | | chunk 1 | | chunk 2 | | | |
42 | || [0,N) | | [N,2N) | | [2N,3N) | | | |
43 | |+----------------+ +----------------+ +----------------+ | | |
44 | +------^--------------------------------------------------+ | |
45 | | | |
46 | chunk B / C | offset B % C ( where C is the chunk size ) | |
47 | | | |
48 | +-----^---- raw object O ----+------+ | |
49 | | B [0,X) | pad | | |
50 | +----------------------------+------+ | |
51 | ||
52 | The object size is paded so that each chunks are of the same size. | |
53 | In the example above, if the actual object size was X, then it | |
54 | will be padded to 2N >= X assuming there are two data chunks (0 | |
55 | and 1) and one coding chunk (2). | |
56 | ||
57 | For chunks of size C, byte B of the object is found in chunk number | |
58 | B / C at offset B % C. | |
59 | ||
60 | If an object is too large to be encoded in memory, the caller | |
61 | should divide it in smaller units named **stripes**. | |
62 | ||
63 | +---------------------- object O -------------------------+ | |
64 | |+----------------+ +----------------+ +----------------+ | | |
65 | stripe || chunk 0 | | chunk 1 | | chunk 2 | | | |
66 | 0 || [0,N) | | [N,2N) | | [2N,3N) | | | |
67 | |+----------------+ +----------------+ +----------------+ | | |
68 | |+----------------+ +----------------+ +----------------+ | | |
69 | stripe || chunk 0 | | chunk 1 | | chunk 2 | | | |
70 | 1 || [X,M) | | [X+M,X+2M) | | [X+2M,X+3M) | | | |
71 | || | | | | | | | |
72 | |+----------------+ +----------------+ +----------------+ | | |
73 | | ... | | |
74 | +---------------------------------------------------------+ | |
75 | ||
76 | The interface does not concern itself with stripes nor does it | |
77 | impose constraints on the size of each stripe. Variable names in | |
78 | the interface always use **object** and never use **stripe**. | |
79 | ||
80 | Assuming the interface implementer provides three data chunks ( K | |
81 | = 3 ) and two coding chunks ( M = 2 ), a buffer could be encoded as | |
82 | follows: | |
83 | ||
84 | ~~~~~~~~~~~~~~~~{.c} | |
85 | set<int> want_to_encode(0, 1, 2, // data chunks | |
86 | 3, 4 // coding chunks | |
87 | ); | |
88 | bufferlist in = "ABCDEF"; | |
89 | map<int, bufferlist> encoded | |
90 | encode(want_to_encode, in, &encoded); | |
91 | encoded[0] == "AB" // data chunk 0 | |
92 | encoded[1] == "CD" // data chunk 1 | |
93 | encoded[2] == "EF" // data chunk 2 | |
94 | encoded[3] // coding chunk 0 | |
95 | encoded[4] // coding chunk 1 | |
96 | ~~~~~~~~~~~~~~~~ | |
97 | ||
98 | The **minimum_to_decode_with_cost** method can be used to minimize | |
99 | the cost of fetching the chunks necessary to retrieve a given | |
100 | content. For instance, if encoded[2] (contained **EF**) is missing | |
101 | and accessing encoded[3] (the first coding chunk) is more | |
102 | expensive than accessing encoded[4] (the second coding chunk), | |
103 | **minimum_to_decode_with_cost** is expected to chose the first | |
104 | coding chunk. | |
105 | ||
106 | ~~~~~~~~~~~~~~~~{.c} | |
107 | set<int> want_to_read(2); // want the chunk containing "EF" | |
108 | map<int,int> available( | |
109 | 0 => 1, // data chunk 0 : available and costs 1 | |
110 | 1 => 1, // data chunk 1 : available and costs 1 | |
111 | // data chunk 2 : missing | |
112 | 3 => 9, // coding chunk 1 : available and costs 9 | |
113 | 4 => 1, // coding chunk 2 : available and costs 1 | |
114 | ); | |
115 | set<int> minimum; | |
116 | minimum_to_decode_with_cost(want_to_read, | |
117 | available, | |
118 | &minimum); | |
119 | minimum == set<int>(0, 1, 4); // NOT set<int>(0, 1, 3); | |
120 | ~~~~~~~~~~~~~~~~ | |
121 | ||
122 | It sets **minimum** with three chunks to reconstruct the desired | |
123 | data chunk and will pick the second coding chunk ( 4 ) because it | |
124 | is less expensive ( 1 < 9 ) to retrieve than the first coding | |
125 | chunk ( 3 ). The caller is responsible for retrieving the chunks | |
126 | and call **decode** to reconstruct the second data chunk. | |
127 | ||
128 | ~~~~~~~~~~~~~~~~{.c} | |
129 | map<int,bufferlist> chunks; | |
130 | for i in minimum.keys(): | |
131 | chunks[i] = fetch_chunk(i); // get chunk from storage | |
132 | map<int, bufferlist> decoded; | |
133 | decode(want_to_read, chunks, &decoded); | |
134 | decoded[2] == "EF" | |
135 | ~~~~~~~~~~~~~~~~ | |
136 | ||
137 | The semantic of the cost value is defined by the caller and must | |
138 | be known to the implementer. For instance, it may be more | |
139 | expensive to retrieve two chunks with cost 1 + 9 = 10 than two | |
140 | chunks with cost 6 + 6 = 12. | |
141 | */ | |
142 | ||
143 | #include <map> | |
144 | #include <set> | |
145 | #include <vector> | |
146 | #include <iostream> | |
147 | #include "include/memory.h" | |
148 | #include "include/buffer_fwd.h" | |
149 | ||
150 | class CrushWrapper; | |
151 | ||
152 | using namespace std; | |
153 | ||
154 | namespace ceph { | |
155 | ||
156 | typedef map<std::string,std::string> ErasureCodeProfile; | |
157 | ||
158 | inline ostream& operator<<(ostream& out, const ErasureCodeProfile& profile) { | |
159 | out << "{"; | |
160 | for (ErasureCodeProfile::const_iterator it = profile.begin(); | |
161 | it != profile.end(); | |
162 | ++it) { | |
163 | if (it != profile.begin()) out << ","; | |
164 | out << it->first << "=" << it->second; | |
165 | } | |
166 | out << "}"; | |
167 | return out; | |
168 | } | |
169 | ||
170 | ||
171 | class ErasureCodeInterface { | |
172 | public: | |
173 | virtual ~ErasureCodeInterface() {} | |
174 | ||
175 | /** | |
176 | * Initialize the instance according to the content of | |
177 | * **profile**. The **ss** stream is set with debug messages or | |
178 | * error messages, the content of which depend on the | |
179 | * implementation. | |
180 | * | |
181 | * Return 0 on success or a negative errno on error. When | |
182 | * returning on error, the implementation is expected to | |
183 | * provide a human readable explanation in **ss**. | |
184 | * | |
185 | * @param [in] profile a key/value map | |
186 | * @param [out] ss contains informative messages when an error occurs | |
187 | * @return 0 on success or a negative errno on error. | |
188 | */ | |
189 | virtual int init(ErasureCodeProfile &profile, ostream *ss) = 0; | |
190 | ||
191 | /** | |
192 | * Return the profile that was used to initialize the instance | |
193 | * with the **init** method. | |
194 | * | |
195 | * @return the profile in use by the instance | |
196 | */ | |
197 | virtual const ErasureCodeProfile &get_profile() const = 0; | |
198 | ||
199 | /** | |
200 | * Create a new ruleset in **crush** under the name **name**, | |
201 | * unless it already exists. | |
202 | * | |
203 | * Return the ruleset number that was created on success. If a | |
204 | * ruleset **name** already exists, return -EEXISTS, otherwise | |
205 | * return a negative value indicating an error with a semantic | |
206 | * defined by the implementation. | |
207 | * | |
208 | * @param [in] name of the ruleset to create | |
209 | * @param [in] crush crushmap in which the ruleset is created | |
210 | * @param [out] ss contains informative messages when an error occurs | |
211 | * @return a ruleset on success or a negative errno on error. | |
212 | */ | |
213 | virtual int create_ruleset(const string &name, | |
214 | CrushWrapper &crush, | |
215 | ostream *ss) const = 0; | |
216 | ||
217 | /** | |
218 | * Return the number of chunks created by a call to the **encode** | |
219 | * method. | |
220 | * | |
221 | * In the simplest case it can be K + M, i.e. the number | |
222 | * of data chunks (K) plus the number of parity chunks | |
223 | * (M). However, if the implementation provides local parity there | |
224 | * could be an additional overhead. | |
225 | * | |
226 | * @return the number of chunks created by encode() | |
227 | */ | |
228 | virtual unsigned int get_chunk_count() const = 0; | |
229 | ||
230 | /** | |
231 | * Return the number of data chunks created by a call to the | |
232 | * **encode** method. The data chunks contain the buffer provided | |
233 | * to **encode**, verbatim, with padding at the end of the last | |
234 | * chunk. | |
235 | * | |
236 | * @return the number of data chunks created by encode() | |
237 | */ | |
238 | virtual unsigned int get_data_chunk_count() const = 0; | |
239 | ||
240 | /** | |
241 | * Return the number of coding chunks created by a call to the | |
242 | * **encode** method. The coding chunks are used to recover from | |
243 | * the loss of one or more chunks. If there is one coding chunk, | |
244 | * it is possible to recover from the loss of exactly one | |
245 | * chunk. If there are two coding chunks, it is possible to | |
246 | * recover from the loss of at most two chunks, etc. | |
247 | * | |
248 | * @return the number of coding chunks created by encode() | |
249 | */ | |
250 | virtual unsigned int get_coding_chunk_count() const = 0; | |
251 | ||
252 | /** | |
253 | * Return the size (in bytes) of a single chunk created by a call | |
254 | * to the **decode** method. The returned size multiplied by | |
255 | * **get_chunk_count()** is greater or equal to **object_size**. | |
256 | * | |
257 | * If the object size is properly aligned, the chunk size is | |
258 | * **object_size / get_chunk_count()**. However, if | |
259 | * **object_size** is not a multiple of **get_chunk_count** or if | |
260 | * the implementation imposes additional alignment constraints, | |
261 | * the chunk size may be larger. | |
262 | * | |
263 | * The byte found at offset **B** of the original object is mapped | |
264 | * to chunk **B / get_chunk_size()** at offset **B % get_chunk_size()**. | |
265 | * | |
266 | * @param [in] object_size the number of bytes of the object to **encode()** | |
267 | * @return the size (in bytes) of a single chunk created by **encode()** | |
268 | */ | |
269 | virtual unsigned int get_chunk_size(unsigned int object_size) const = 0; | |
270 | ||
271 | /** | |
272 | * Compute the smallest subset of **available** chunks that needs | |
273 | * to be retrieved in order to successfully decode | |
274 | * **want_to_read** chunks. | |
275 | * | |
276 | * It is strictly equivalent to calling | |
277 | * **minimum_to_decode_with_cost** where each **available** chunk | |
278 | * has the same cost. | |
279 | * | |
280 | * @see minimum_to_decode_with_cost | |
281 | * | |
282 | * @param [in] want_to_read chunk indexes to be decoded | |
283 | * @param [in] available chunk indexes containing valid data | |
284 | * @param [out] minimum chunk indexes to retrieve | |
285 | * @return **0** on success or a negative errno on error. | |
286 | */ | |
287 | virtual int minimum_to_decode(const set<int> &want_to_read, | |
288 | const set<int> &available, | |
289 | set<int> *minimum) = 0; | |
290 | ||
291 | /** | |
292 | * Compute the smallest subset of **available** chunks that needs | |
293 | * to be retrieved in order to successfully decode | |
294 | * **want_to_read** chunks. If there are more than one possible | |
295 | * subset, select the subset that minimizes the overall retrieval | |
296 | * cost. | |
297 | * | |
298 | * The **available** parameter maps chunk indexes to their | |
299 | * retrieval cost. The higher the cost value, the more costly it | |
300 | * is to retrieve the chunk content. | |
301 | * | |
302 | * Returns -EIO if there are not enough chunk indexes in | |
303 | * **available** to decode **want_to_read**. | |
304 | * | |
305 | * Returns 0 on success. | |
306 | * | |
307 | * The **minimum** argument must be a pointer to an empty set. | |
308 | * | |
309 | * @param [in] want_to_read chunk indexes to be decoded | |
310 | * @param [in] available map chunk indexes containing valid data | |
311 | * to their retrieval cost | |
312 | * @param [out] minimum chunk indexes to retrieve | |
313 | * @return **0** on success or a negative errno on error. | |
314 | */ | |
315 | virtual int minimum_to_decode_with_cost(const set<int> &want_to_read, | |
316 | const map<int, int> &available, | |
317 | set<int> *minimum) = 0; | |
318 | ||
319 | /** | |
320 | * Encode the content of **in** and store the result in | |
321 | * **encoded**. All buffers pointed to by **encoded** have the | |
322 | * same size. The **encoded** map contains at least all chunk | |
323 | * indexes found in the **want_to_encode** set. | |
324 | * | |
325 | * The **encoded** map is expected to be a pointer to an empty | |
326 | * map. | |
327 | * | |
328 | * Assuming the **in** parameter is **length** bytes long, | |
329 | * the concatenation of the first **length** bytes of the | |
330 | * **encoded** buffers is equal to the content of the **in** | |
331 | * parameter. | |
332 | * | |
333 | * The **encoded** map may contain more chunks than required by | |
334 | * **want_to_encode** and the caller is expected to permanently | |
335 | * store all of them, not just the chunks listed in | |
336 | * **want_to_encode**. | |
337 | * | |
338 | * The **encoded** map may contain pointers to data stored in | |
339 | * the **in** parameter. If the caller modifies the content of | |
340 | * **in** after calling the encode method, it may have a side | |
341 | * effect on the content of **encoded**. | |
342 | * | |
343 | * The **encoded** map may contain pointers to buffers allocated | |
344 | * by the encode method. They will be freed when **encoded** is | |
345 | * freed. The allocation method is not specified. | |
346 | * | |
347 | * Returns 0 on success. | |
348 | * | |
349 | * @param [in] want_to_encode chunk indexes to be encoded | |
350 | * @param [in] in data to be encoded | |
351 | * @param [out] encoded map chunk indexes to chunk data | |
352 | * @return **0** on success or a negative errno on error. | |
353 | */ | |
354 | virtual int encode(const set<int> &want_to_encode, | |
355 | const bufferlist &in, | |
356 | map<int, bufferlist> *encoded) = 0; | |
357 | ||
358 | ||
359 | virtual int encode_chunks(const set<int> &want_to_encode, | |
360 | map<int, bufferlist> *encoded) = 0; | |
361 | ||
362 | /** | |
363 | * Decode the **chunks** and store at least **want_to_read** | |
364 | * chunks in **decoded**. | |
365 | * | |
366 | * The **decoded** map must be a pointer to an empty map. | |
367 | * | |
368 | * There must be enough **chunks** ( as returned by | |
369 | * **minimum_to_decode** or **minimum_to_decode_with_cost** ) to | |
370 | * perform a successful decoding of all chunks listed in | |
371 | * **want_to_read**. | |
372 | * | |
373 | * All buffers pointed by **in** must have the same size. | |
374 | * | |
375 | * On success, the **decoded** map may contain more chunks than | |
376 | * required by **want_to_read** and they can safely be used by the | |
377 | * caller. | |
378 | * | |
379 | * If a chunk is listed in **want_to_read** and there is a | |
380 | * corresponding **bufferlist** in **chunks**, it will be | |
381 | * referenced in **decoded**. If not it will be reconstructed from | |
382 | * the existing chunks. | |
383 | * | |
384 | * Because **decoded** may contain pointers to data found in | |
385 | * **chunks**, modifying the content of **chunks** after calling | |
386 | * decode may have a side effect on the content of **decoded**. | |
387 | * | |
388 | * Returns 0 on success. | |
389 | * | |
390 | * @param [in] want_to_read chunk indexes to be decoded | |
391 | * @param [in] chunks map chunk indexes to chunk data | |
392 | * @param [out] decoded map chunk indexes to chunk data | |
393 | * @return **0** on success or a negative errno on error. | |
394 | */ | |
395 | virtual int decode(const set<int> &want_to_read, | |
396 | const map<int, bufferlist> &chunks, | |
397 | map<int, bufferlist> *decoded) = 0; | |
398 | ||
399 | virtual int decode_chunks(const set<int> &want_to_read, | |
400 | const map<int, bufferlist> &chunks, | |
401 | map<int, bufferlist> *decoded) = 0; | |
402 | ||
403 | /** | |
404 | * Return the ordered list of chunks or an empty vector | |
405 | * if no remapping is necessary. | |
406 | * | |
407 | * By default encoding an object with K=2,M=1 will create three | |
408 | * chunks, the first two are data and the last one coding. For | |
409 | * a 10MB object, it would be: | |
410 | * | |
411 | * chunk 0 for the first 5MB | |
412 | * chunk 1 for the last 5MB | |
413 | * chunk 2 for the 5MB coding chunk | |
414 | * | |
415 | * The plugin may, however, decide to remap them in a different | |
416 | * order, such as: | |
417 | * | |
418 | * chunk 0 for the last 5MB | |
419 | * chunk 1 for the 5MB coding chunk | |
420 | * chunk 2 for the first 5MB | |
421 | * | |
422 | * The vector<int> remaps the chunks so that the first chunks are | |
423 | * data, in sequential order, and the last chunks contain parity | |
424 | * in the same order as they were output by the encoding function. | |
425 | * | |
426 | * In the example above the mapping would be: | |
427 | * | |
428 | * [ 1, 2, 0 ] | |
429 | * | |
430 | * The returned vector<int> only contains information for chunks | |
431 | * that need remapping. If no remapping is necessary, the | |
432 | * vector<int> is empty. | |
433 | * | |
434 | * @return vector<int> list of indices of chunks to be remapped | |
435 | */ | |
436 | virtual const vector<int> &get_chunk_mapping() const = 0; | |
437 | ||
438 | /** | |
439 | * Decode the first **get_data_chunk_count()** **chunks** and | |
440 | * concatenate them into **decoded**. | |
441 | * | |
442 | * Returns 0 on success. | |
443 | * | |
444 | * @param [in] chunks map chunk indexes to chunk data | |
445 | * @param [out] decoded concatenante of the data chunks | |
446 | * @return **0** on success or a negative errno on error. | |
447 | */ | |
448 | virtual int decode_concat(const map<int, bufferlist> &chunks, | |
449 | bufferlist *decoded) = 0; | |
450 | }; | |
451 | ||
452 | typedef ceph::shared_ptr<ErasureCodeInterface> ErasureCodeInterfaceRef; | |
453 | ||
454 | } | |
455 | ||
456 | #endif |