]> git.proxmox.com Git - ceph.git/blob - ceph/src/crush/crush.h
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / crush / crush.h
1 #ifndef CEPH_CRUSH_CRUSH_H
2 #define CEPH_CRUSH_CRUSH_H
3
4 #ifdef __KERNEL__
5 # include <linux/types.h>
6 #else
7 # include "crush_compat.h"
8 #endif
9
10 /*
11 * CRUSH is a pseudo-random data distribution algorithm that
12 * efficiently distributes input values (typically, data objects)
13 * across a heterogeneous, structured storage cluster.
14 *
15 * The algorithm was originally described in detail in this paper
16 * (although the algorithm has evolved somewhat since then):
17 *
18 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
19 *
20 * LGPL-2.1 or LGPL-3.0
21 */
22
23
#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */

#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
#define CRUSH_MAX_RULES (1<<8) /* max crush rule id */

/* Weights throughout CRUSH are 16.16 fixed point: 0x10000 == 1.0. */
#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)     /* 100.0 */
#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)   /* 65535.0 */

#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
/** @ingroup API
 * The equivalent of NULL for an item, i.e. the absence of an item.
 */
#define CRUSH_ITEM_NONE 0x7fffffff
37
/*
 * CRUSH uses user-defined "rules" to describe how inputs should be
 * mapped to devices.  A rule consists of a sequence of steps to
 * perform to generate the set of output devices.
 */
struct crush_rule_step {
	__u32 op;   /* step opcode, one of enum crush_opcodes */
	__s32 arg1; /* first argument; meaning depends on op */
	__s32 arg2; /* second argument; meaning depends on op */
};
48
/** @ingroup API
 * Opcodes for the steps (crush_rule_step) that make up a crush_rule.
 */
enum crush_opcodes {
	/*! do nothing
	 */
	CRUSH_RULE_NOOP = 0,
	CRUSH_RULE_TAKE = 1,           /* arg1 = value to start with */
	CRUSH_RULE_CHOOSE_FIRSTN = 2,  /* arg1 = num items to pick */
	                               /* arg2 = type */
	CRUSH_RULE_CHOOSE_INDEP = 3,   /* same */
	CRUSH_RULE_EMIT = 4,           /* no args */
	CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
	CRUSH_RULE_CHOOSELEAF_INDEP = 7,

	CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
	CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
	CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
	CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
	CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
	CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
};
70
/*
 * for specifying choose num (arg1) relative to the max parameter
 * passed to do_rule
 *
 * NOTE(review): encoding appears to be "0 means max, -x means
 * max - x" — confirm against the mapper's handling of arg1.
 */
#define CRUSH_CHOOSE_N 0
#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
77
78 struct crush_rule {
79 __u32 len;
80 __u8 __unused_was_rule_mask_ruleset;
81 __u8 type;
82 __u8 deprecated_min_size;
83 __u8 deprecated_max_size;
84 struct crush_rule_step steps[0];
85 };
86
/* Total allocation size in bytes for a crush_rule with __len__ steps. */
#define crush_rule_size(len) (sizeof(struct crush_rule) + \
			      (len)*sizeof(struct crush_rule_step))
89
90
91
/*
 * A bucket is a named container of other items (either devices or
 * other buckets).
 */

/** @ingroup API
 *
 * Items within a bucket are chosen with crush_do_rule() using one of
 * three algorithms representing a tradeoff between performance and
 * reorganization efficiency. If you are unsure of which bucket type
 * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
 *
 * The table summarizes how the speed of each option measures up
 * against mapping stability when items are added or removed.
 *
 *  Bucket Alg     Speed       Additions    Removals
 *  ------------------------------------------------
 *  uniform         O(1)       poor         poor
 *  list            O(n)       optimal      poor
 *  straw2          O(n)       optimal      optimal
 */
enum crush_algorithm {
	/*!
	 * Devices are rarely added individually in a large system.
	 * Instead, new storage is typically deployed in blocks of identical
	 * devices, often as an additional shelf in a server rack or perhaps
	 * an entire cabinet. Devices reaching their end of life are often
	 * similarly decommissioned as a set (individual failures aside),
	 * making it natural to treat them as a unit.  CRUSH uniform buckets
	 * are used to represent an identical set of devices in such
	 * circumstances. The key advantage in doing so is performance
	 * related: CRUSH can map replicas into uniform buckets in constant
	 * time. In cases where the uniformity restrictions are not
	 * appropriate, other bucket types can be used.  If the size of a
	 * uniform bucket changes, there is a complete reshuffling of data
	 * between devices, much like conventional hash-based distribution
	 * strategies.
	 */
	CRUSH_BUCKET_UNIFORM = 1,
	/*!
	 * List buckets structure their contents as a linked list, and
	 * can contain items with arbitrary weights.  To place a
	 * replica, CRUSH begins at the head of the list with the most
	 * recently added item and compares its weight to the sum of
	 * all remaining items' weights.  Depending on the value of
	 * hash( x , r , item), either the current item is chosen with
	 * the appropriate probability, or the process continues
	 * recursively down the list.  This is a natural and intuitive
	 * choice for an expanding cluster: either an object is
	 * relocated to the newest device with some appropriate
	 * probability, or it remains on the older devices as before.
	 * The result is optimal data migration when items are added
	 * to the bucket. Items removed from the middle or tail of the
	 * list, however, can result in a significant amount of
	 * unnecessary movement, making list buckets most suitable for
	 * circumstances in which they never (or very rarely) shrink.
	 */
	CRUSH_BUCKET_LIST = 2,
	/*! @cond INTERNAL */
	CRUSH_BUCKET_TREE = 3,
	CRUSH_BUCKET_STRAW = 4,
	/*! @endcond */
	/*!
	 * List and tree buckets are structured such that a limited
	 * number of hash values need to be calculated and compared to
	 * weights in order to select a bucket item.  In doing so,
	 * they divide and conquer in a way that either gives certain
	 * items precedence (e. g., those at the beginning of a list)
	 * or obviates the need to consider entire subtrees of items
	 * at all. That improves the performance of the replica
	 * placement process, but can also introduce suboptimal
	 * reorganization behavior when the contents of a bucket
	 * change due to an addition, removal, or re-weighting of an
	 * item.
	 *
	 * The straw2 bucket type allows all items to fairly "compete"
	 * against each other for replica placement through a process
	 * analogous to a draw of straws.  To place a replica, a straw
	 * of random length is drawn for each item in the bucket.  The
	 * item with the longest straw wins.  The length of each straw
	 * is initially a value in a fixed range.  Each straw length
	 * is scaled by a factor based on the item's weight so that
	 * heavily weighted items are more likely to win the draw.
	 * Although this process is almost twice as slow (on average)
	 * as a list bucket and even slower than a tree bucket
	 * (which scales logarithmically), straw2 buckets result in
	 * optimal data movement between nested items when modified.
	 */
	CRUSH_BUCKET_STRAW2 = 5,
};
/* Name string for bucket algorithm __alg__ (implemented in crush.c;
 * presumably mirrors crush_alg_name() below — see crush.c). */
extern const char *crush_bucket_alg_name(int alg);

/*
 * although tree was a legacy algorithm, it has been buggy, so
 * exclude it.
 *
 * Note this is a bitmask of (1 << CRUSH_BUCKET_*) bits, like
 * crush_map.allowed_bucket_algs.
 */
#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
		(1 << CRUSH_BUCKET_UNIFORM) | \
		(1 << CRUSH_BUCKET_LIST) | \
		(1 << CRUSH_BUCKET_STRAW))
192
/** @ingroup API
 *
 * A bucket contains __size__ __items__ which are either positive
 * numbers or negative numbers that reference other buckets and is
 * uniquely identified with __id__ which is a negative number.  The
 * __weight__ of a bucket is the cumulative weight of all its
 * children.  A bucket is assigned a ::crush_algorithm that is used by
 * crush_do_rule() to draw an item depending on its weight.  A bucket
 * can be assigned a strictly positive (> 0) __type__ defined by the
 * caller.  The __type__ can be used by crush_do_rule(), when it is
 * given as an argument of a rule step.
 *
 * A pointer to crush_bucket can safely be cast into the following
 * structure, depending on the value of __alg__:
 *
 * - __alg__ == ::CRUSH_BUCKET_UNIFORM  cast to crush_bucket_uniform
 * - __alg__ == ::CRUSH_BUCKET_LIST     cast to crush_bucket_list
 * - __alg__ == ::CRUSH_BUCKET_STRAW2   cast to crush_bucket_straw2
 *
 * The weight of each item depends on the algorithm and the
 * information about it is available in the corresponding structure
 * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
 *
 * See crush_map for more information on how __id__ is used
 * to reference the bucket.
 */
struct crush_bucket {
	__s32 id;        /*!< bucket identifier, < 0 and unique within a crush_map */
	__u16 type;      /*!< > 0 bucket type, defined by the caller */
	__u8 alg;        /*!< the item selection ::crush_algorithm */
	/*! @cond INTERNAL */
	__u8 hash;       /* which hash function to use, CRUSH_HASH_* */
	/*! @endcond */
	__u32 weight;    /*!< 16.16 fixed point cumulative children weight */
	__u32 size;      /*!< size of the __items__ array */
	__s32 *items;    /*!< array of children: < 0 are buckets, >= 0 items */
};
230
/** @ingroup API
 *
 * Replacement weights for each item in a bucket.  The size of the
 * array must be exactly the size of the straw2 bucket, just as the
 * item_weights array.
 *
 */
struct crush_weight_set {
	__u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
	__u32 size;     /*!< size of the __weights__ array */
};
242
/** @ingroup API
 *
 * Replacement weights and ids for a given straw2 bucket, for
 * placement purposes.
 *
 * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
 * replacement weights found at __weight_set[N]__ are used instead of
 * the weights from __item_weights__.  If __N__ is greater than or
 * equal to __weight_set_positions__, the weights found at
 * __weight_set_positions-1__ are used instead.  For instance if
 * __weight_set__ is:
 *
 *    [ [ 0x10000, 0x20000 ],   // position 0
 *      [ 0x20000, 0x40000 ] ]  // position 1
 *
 * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
 * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
 * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
 * etc.
 *
 */
struct crush_choose_arg {
	__s32 *ids;            /*!< values to use instead of items */
	__u32 ids_size;        /*!< size of the __ids__ array */
	struct crush_weight_set *weight_set; /*!< weight replacements for a given position */
	__u32 weight_set_positions; /*!< size of the __weight_set__ array */
};
269
/** @ingroup API
 *
 * Replacement weights and ids for each bucket in the crushmap.  The
 * __size__ of the __args__ array must be exactly the same as the
 * __map->max_buckets__.
 *
 * The __crush_choose_arg__ at index N will be used when choosing
 * an item from the bucket __map->buckets[N]__ bucket, provided it
 * is a straw2 bucket.
 *
 */
struct crush_choose_arg_map {
	struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
	__u32 size;                    /*!< size of the __args__ array */
};
285
/** @ingroup API
 * The weight of each item in the bucket when
 * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
 *
 * All items share the single __item_weight__ value.
 */
struct crush_bucket_uniform {
	struct crush_bucket h; /*!< generic bucket information */
	__u32 item_weight;     /*!< 16.16 fixed point weight for each item */
};
294
/** @ingroup API
 * The weight of each item in the bucket when
 * __h.alg__ == ::CRUSH_BUCKET_LIST.
 *
 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
 * [0,__h.size__[.  The __sum_weight__[i] is the sum of the
 * __item_weights[j]__ for j in [0,i[.
 *
 */
struct crush_bucket_list {
	struct crush_bucket h; /*!< generic bucket information */
	__u32 *item_weights;   /*!< 16.16 fixed point weight for each item */
	__u32 *sum_weights;    /*!< 16.16 fixed point sum of the weights */
};
309
/* Legacy bucket type (excluded from CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
 * as buggy; see comment above that macro). */
struct crush_bucket_tree {
	struct crush_bucket h; /* note: h.size is _tree_ size, not number of
				  actual items */
	__u8 num_nodes;        /* number of tree nodes; NOTE(review): __u8
				  caps this at 255 — confirm intended */
	__u32 *node_weights;   /* per-node weights (same 16.16 fixed point
				  as other buckets, presumably — verify) */
};
316
/* Deprecated bucket type, superseded by ::CRUSH_BUCKET_STRAW2
 * (see the straw_calc_version tunable in crush_map). */
struct crush_bucket_straw {
	struct crush_bucket h;
	__u32 *item_weights;   /* 16-bit fixed point */
	__u32 *straws;         /* 16-bit fixed point */
};
322
/** @ingroup API
 * The weight of each item in the bucket when
 * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
 *
 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
 * [0,__h.size__[.
 */
struct crush_bucket_straw2 {
	struct crush_bucket h; /*!< generic bucket information */
	__u32 *item_weights;   /*!< 16.16 fixed point weight for each item */
};
334
335
336
/** @ingroup API
 *
 * A crush map defines a hierarchy of crush_bucket that end with leaves
 * (buckets and leaves are called items) and a set of crush_rule to
 * map an integer to items with the crush_do_rule() function.
 *
 */
struct crush_map {
	/*! An array of crush_bucket pointers of size __max_buckets__.
	 * An element of the array may be NULL if the bucket was removed with
	 * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
	 * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
	 */
	struct crush_bucket **buckets;
	/*! An array of crush_rule pointers of size __max_rules__.
	 * An element of the array may be NULL if the rule was removed (there is
	 * no API to do so but there may be one in the future). The rules must be added
	 * with crush_add_rule().
	 */
	struct crush_rule **rules;
	__s32 max_buckets; /*!< the size of __buckets__ */
	__u32 max_rules;   /*!< the size of __rules__ */
	/*! The value of the highest item stored in the crush_map + 1
	 */
	__s32 max_devices;

	/*! Backward compatibility tunable. It implements a bad solution
	 * and must always be set to 0 except for backward compatibility
	 * purposes
	 */
	__u32 choose_local_tries;
	/*! Backward compatibility tunable. It implements a bad solution
	 * and must always be set to 0 except for backward compatibility
	 * purposes
	 */
	__u32 choose_local_fallback_tries;
	/*! Tunable. The default value when the CHOOSE_TRIES or
	 * CHOOSELEAF_TRIES steps are omitted in a rule. See the
	 * documentation for crush_rule_set_step() for more
	 * information
	 */
	__u32 choose_total_tries;
	/*! Backward compatibility tunable. It should always be set
	 * to 1 except for backward compatibility. Implemented in 2012
	 * it was generalized late 2013 and is mostly unused except
	 * in one border case, reason why it must be set to 1.
	 *
	 * Attempt chooseleaf inner descent once for firstn mode; on
	 * reject retry outer descent.  Note that this does *not*
	 * apply to a collision: in that case we will retry as we
	 * used to.
	 */
	__u32 chooseleaf_descend_once;
	/*! Backward compatibility tunable. It is a fix for bad
	 * mappings implemented in 2014 at
	 * https://github.com/ceph/ceph/pull/1185. It should always
	 * be set to 1 except for backward compatibility.
	 *
	 * If non-zero, feed r into chooseleaf, bit-shifted right by
	 * (r-1) bits.  a value of 1 is best for new clusters.  for
	 * legacy clusters that want to limit reshuffling, a value of
	 * 3 or 4 will make the mappings line up a bit better with
	 * previous mappings.
	 */
	__u8 chooseleaf_vary_r;

	/*! Backward compatibility tunable. It is an improvement that
	 * avoids unnecessary mapping changes, implemented at
	 * https://github.com/ceph/ceph/pull/6572 and explained in
	 * this post: "chooseleaf may cause some unnecessary pg
	 * migrations" in October 2015
	 * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
	 * It should always be set to 1 except for backward compatibility.
	 */
	__u8 chooseleaf_stable;

	/*! @cond INTERNAL */
	/* This value is calculated after decode or construction by
	   the builder. It is exposed here (rather than having a
	   'build CRUSH working space' function) so that callers can
	   reserve a static buffer, allocate space on the stack, or
	   otherwise avoid calling into the heap allocator if they
	   want to. The size of the working space depends on the map,
	   while the size of the scratch vector passed to the mapper
	   depends on the size of the desired result set.

	   Nothing stops the caller from allocating both in one swell
	   foop and passing in two pointers, though. */
	size_t working_size;

#ifndef __KERNEL__
	/*! @endcond */
	/*! Backward compatibility tunable. It is a fix for the straw
	 * scaler values for the straw algorithm which is deprecated
	 * (straw2 replaces it) implemented at
	 * https://github.com/ceph/ceph/pull/3057. It should always
	 * be set to 1 except for backward compatibility.
	 *
	 */
	__u8 straw_calc_version;

	/*! @cond INTERNAL */
	/*
	 * allowed bucket algs is a bitmask, here the bit positions
	 * are CRUSH_BUCKET_*.  note that these are *bits* and
	 * CRUSH_BUCKET_* values are not, so we need to or together (1
	 * << CRUSH_BUCKET_WHATEVER).  The 0th bit is not used to
	 * minimize confusion (bucket type values start at 1).
	 */
	__u32 allowed_bucket_algs;

	__u32 *choose_tries;
#endif
	/*! @endcond */
};
452
453
/* crush.c */
/** @ingroup API
 *
 * Return the 16.16 fixed point weight of the item at __pos__ (zero
 * based index) within the bucket __b__. If __pos__ is negative or
 * greater or equal to the number of items in the bucket, return 0.
 *
 * @param b the bucket containing items
 * @param pos the zero based index of the item
 *
 * @returns the 16.16 fixed point item weight
 */
extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
/* Per-algorithm bucket destructors (presumably invoked by
 * crush_destroy_bucket() according to b->alg — see crush.c). */
extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
/** @ingroup API
 *
 * Deallocate a bucket created via crush_add_bucket().
 *
 * @param b the bucket to deallocate
 */
extern void crush_destroy_bucket(struct crush_bucket *b);
/** @ingroup API
 *
 * Deallocate a rule created via crush_add_rule().
 *
 * @param r the rule to deallocate
 */
extern void crush_destroy_rule(struct crush_rule *r);
/** @ingroup API
 *
 * Deallocate the __map__, previously allocated with crush_create.
 *
 * @param map the crush map
 */
extern void crush_destroy(struct crush_map *map);
493
/* Map an item index to its node in a tree bucket's implicit binary
 * tree: in the 0-indexed heap layout this is 2*i + 1, which is
 * arithmetically identical to the classic ((i+1) << 1) - 1. */
static inline int crush_calc_tree_node(int i)
{
	return 2 * i + 1;
}
498
499 static inline const char *crush_alg_name(int alg)
500 {
501 switch (alg) {
502 case CRUSH_BUCKET_UNIFORM:
503 return "uniform";
504 case CRUSH_BUCKET_LIST:
505 return "list";
506 case CRUSH_BUCKET_TREE:
507 return "tree";
508 case CRUSH_BUCKET_STRAW:
509 return "straw";
510 case CRUSH_BUCKET_STRAW2:
511 return "straw2";
512 default:
513 return "unknown";
514 }
515 }
516
517 /* ---------------------------------------------------------------------
518 Private
519 --------------------------------------------------------------------- */
520
521 /* These data structures are private to the CRUSH implementation. They
522 are exposed in this header file because builder needs their
523 definitions to calculate the total working size.
524
   Moving this out of the crush map allows us to treat the CRUSH map as
526 immutable within the mapper and removes the requirement for a CRUSH
527 map lock. */
528
/* Per-bucket scratch state: a (partially built) permutation of the
 * bucket's items, valid for a single input value @x. */
struct crush_work_bucket {
	__u32 perm_x; /* @x for which *perm is defined */
	__u32 perm_n; /* num elements of *perm that are permuted/defined */
	__u32 *perm;  /* Permutation of the bucket's items */
} __attribute__ ((packed));
534
/* Working space passed to the mapper; sized per the map (see
 * crush_map.working_size), holding one entry per bucket. */
struct crush_work {
	struct crush_work_bucket **work; /* Per-bucket working store */
};
538
539 #endif