]> git.proxmox.com Git - ceph.git/blame - ceph/src/crush/crush.h
update sources to 12.2.7
[ceph.git] / ceph / src / crush / crush.h
CommitLineData
7c673cae
FG
1#ifndef CEPH_CRUSH_CRUSH_H
2#define CEPH_CRUSH_CRUSH_H
3
4#ifdef __KERNEL__
5# include <linux/types.h>
6#else
7# include "crush_compat.h"
8#endif
9
10/*
11 * CRUSH is a pseudo-random data distribution algorithm that
12 * efficiently distributes input values (typically, data objects)
13 * across a heterogeneous, structured storage cluster.
14 *
15 * The algorithm was originally described in detail in this paper
16 * (although the algorithm has evolved somewhat since then):
17 *
18 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
19 *
20 * LGPL2
21 */
22
23
24#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
25
26#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
27#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
28#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
29
30#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
31#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
32
33#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
34/** @ingroup API
35 * The equivalent of NULL for an item, i.e. the absence of an item.
36 */
37#define CRUSH_ITEM_NONE 0x7fffffff
38
39/*
40 * CRUSH uses user-defined "rules" to describe how inputs should be
41 * mapped to devices. A rule consists of sequence of steps to perform
42 * to generate the set of output devices.
43 */
44struct crush_rule_step {
45 __u32 op;
46 __s32 arg1;
47 __s32 arg2;
48};
49
50/** @ingroup API
51 */
52enum crush_opcodes {
53 /*! do nothing
54 */
55 CRUSH_RULE_NOOP = 0,
56 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
57 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
58 /* arg2 = type */
59 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
60 CRUSH_RULE_EMIT = 4, /* no args */
61 CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
62 CRUSH_RULE_CHOOSELEAF_INDEP = 7,
63
64 CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
65 CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
66 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
67 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
68 CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
69 CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
70};
71
72/*
73 * for specifying choose num (arg1) relative to the max parameter
74 * passed to do_rule
75 */
76#define CRUSH_CHOOSE_N 0
77#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
78
79/*
80 * The rule mask is used to describe what the rule is intended for.
81 * Given a ruleset and size of output set, we search through the
82 * rule list for a matching rule_mask.
83 */
84struct crush_rule_mask {
85 __u8 ruleset;
86 __u8 type;
87 __u8 min_size;
88 __u8 max_size;
89};
90
91struct crush_rule {
92 __u32 len;
93 struct crush_rule_mask mask;
94 struct crush_rule_step steps[0];
95};
96
97#define crush_rule_size(len) (sizeof(struct crush_rule) + \
98 (len)*sizeof(struct crush_rule_step))
99
100
101
102/*
103 * A bucket is a named container of other items (either devices or
104 * other buckets).
105 */
106
107/** @ingroup API
108 *
109 * Items within a bucket are chosen with crush_do_rule() using one of
110 * three algorithms representing a tradeoff between performance and
111 * reorganization efficiency. If you are unsure of which bucket type
112 * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
113 *
114 * The table summarizes how the speed of each option measures up
115 * against mapping stability when items are added or removed.
116 *
117 * Bucket Alg Speed Additions Removals
118 * ------------------------------------------------
119 * uniform O(1) poor poor
120 * list O(n) optimal poor
121 * straw2 O(n) optimal optimal
122 */
123enum crush_algorithm {
124 /*!
125 * Devices are rarely added individually in a large system.
126 * Instead, new storage is typically deployed in blocks of identical
127 * devices, often as an additional shelf in a server rack or perhaps
128 * an entire cabinet. Devices reaching their end of life are often
129 * similarly decommissioned as a set (individual failures aside),
130 * making it natural to treat them as a unit. CRUSH uniform buckets
131 * are used to represent an identical set of devices in such
132 * circumstances. The key advantage in doing so is performance
133 * related: CRUSH can map replicas into uniform buckets in constant
134 * time. In cases where the uniformity restrictions are not
135 * appropriate, other bucket types can be used. If the size of a
136 * uniform bucket changes, there is a complete reshuffling of data
137 * between devices, much like conventional hash-based distribution
138 * strategies.
139 */
140 CRUSH_BUCKET_UNIFORM = 1,
141 /*!
142 * List buckets structure their contents as a linked list, and
143 * can contain items with arbitrary weights. To place a
144 * replica, CRUSH begins at the head of the list with the most
145 * recently added item and compares its weight to the sum of
224ce89b 146 * all remaining items' weights. Depending on the value of
7c673cae
FG
147 * hash( x , r , item), either the current item is chosen with
148 * the appropriate probability, or the process continues
149 * recursively down the list. This is a natural and intuitive
150 * choice for an expanding cluster: either an object is
151 * relocated to the newest device with some appropriate
152 * probability, or it remains on the older devices as before.
153 * The result is optimal data migration when items are added
154 * to the bucket. Items removed from the middle or tail of the
155 * list, however, can result in a significant amount of
156 * unnecessary movement, making list buckets most suitable for
157 * circumstances in which they never (or very rarely) shrink.
158 */
159 CRUSH_BUCKET_LIST = 2,
160 /*! @cond INTERNAL */
161 CRUSH_BUCKET_TREE = 3,
162 CRUSH_BUCKET_STRAW = 4,
163 /*! @endcond */
164 /*!
165 * List and tree buckets are structured such that a limited
166 * number of hash values need to be calculated and compared to
167 * weights in order to select a bucket item. In doing so,
168 * they divide and conquer in a way that either gives certain
169 * items precedence (e. g., those at the beginning of a list)
170 * or obviates the need to consider entire subtrees of items
171 * at all. That improves the performance of the replica
172 * placement process, but can also introduce suboptimal
173 * reorganization behavior when the contents of a bucket
174 * change due an addition, removal, or re-weighting of an
175 * item.
176 *
224ce89b 177 * The straw2 bucket type allows all items to fairly "compete"
7c673cae
FG
178 * against each other for replica placement through a process
179 * analogous to a draw of straws. To place a replica, a straw
180 * of random length is drawn for each item in the bucket. The
181 * item with the longest straw wins. The length of each straw
182 * is initially a value in a fixed range. Each straw length
224ce89b 183 * is scaled by a factor based on the item's weight so that
7c673cae
FG
184 * heavily weighted items are more likely to win the draw.
185 * Although this process is almost twice as slow (on average)
186 * than a list bucket and even slower than a tree bucket
187 * (which scales logarithmically), straw2 buckets result in
188 * optimal data movement between nested items when modified.
189 */
190 CRUSH_BUCKET_STRAW2 = 5,
191};
192extern const char *crush_bucket_alg_name(int alg);
193
194/*
195 * although tree was a legacy algorithm, it has been buggy, so
196 * exclude it.
197 */
198#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
199 (1 << CRUSH_BUCKET_UNIFORM) | \
200 (1 << CRUSH_BUCKET_LIST) | \
201 (1 << CRUSH_BUCKET_STRAW))
202
203/** @ingroup API
204 *
205 * A bucket contains __size__ __items__ which are either positive
206 * numbers or negative numbers that reference other buckets and is
207 * uniquely identified with __id__ which is a negative number. The
208 * __weight__ of a bucket is the cumulative weight of all its
209 * children. A bucket is assigned a ::crush_algorithm that is used by
210 * crush_do_rule() to draw an item depending on its weight. A bucket
211 * can be assigned a strictly positive (> 0) __type__ defined by the
212 * caller. The __type__ can be used by crush_do_rule(), when it is
213 * given as an argument of a rule step.
214 *
215 * A pointer to crush_bucket can safely be cast into the following
216 * structure, depending on the value of __alg__:
217 *
218 * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
219 * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
220 * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
221 *
222 * The weight of each item depends on the algorithm and the
223 * information about it is available in the corresponding structure
224 * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
225 *
226 * See crush_map for more information on how __id__ is used
227 * to reference the bucket.
228 */
229struct crush_bucket {
230 __s32 id; /*!< bucket identifier, < 0 and unique within a crush_map */
231 __u16 type; /*!< > 0 bucket type, defined by the caller */
232 __u8 alg; /*!< the item selection ::crush_algorithm */
233 /*! @cond INTERNAL */
234 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
235 /*! @endcond */
236 __u32 weight; /*!< 16.16 fixed point cumulated children weight */
237 __u32 size; /*!< size of the __items__ array */
238 __s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */
239};
240
241/** @ingroup API
242 *
243 * Replacement weights for each item in a bucket. The size of the
244 * array must be exactly the size of the straw2 bucket, just as the
245 * item_weights array.
246 *
247 */
248struct crush_weight_set {
249 __u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
250 __u32 size; /*!< size of the __weights__ array */
251};
252
253/** @ingroup API
254 *
255 * Replacement weights and ids for a given straw2 bucket, for
256 * placement purposes.
257 *
258 * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
259 * replacement weights found at __weight_set[N]__ are used instead of
260 * the weights from __item_weights__. If __N__ is greater than
28e407b8 261 * __weight_set_positions__, the weights found at __weight_set_positions-1__ are
7c673cae
FG
262 * used instead. For instance if __weight_set__ is:
263 *
264 * [ [ 0x10000, 0x20000 ], // position 0
265 * [ 0x20000, 0x40000 ] ] // position 1
266 *
267 * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
268 * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
269 * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
270 * etc.
271 *
272 */
273struct crush_choose_arg {
224ce89b 274 __s32 *ids; /*!< values to use instead of items */
7c673cae
FG
275 __u32 ids_size; /*!< size of the __ids__ array */
276 struct crush_weight_set *weight_set; /*!< weight replacements for a given position */
28e407b8 277 __u32 weight_set_positions; /*!< size of the __weight_set__ array */
7c673cae
FG
278};
279
280/** @ingroup API
281 *
282 * Replacement weights and ids for each bucket in the crushmap. The
283 * __size__ of the __args__ array must be exactly the same as the
284 * __map->max_buckets__.
285 *
286 * The __crush_choose_arg__ at index N will be used when choosing
287 * an item from the bucket __map->buckets[N]__ bucket, provided it
288 * is a straw2 bucket.
289 *
290 */
291struct crush_choose_arg_map {
292 struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
293 __u32 size; /*!< size of the __args__ array */
294};
295
296/** @ingroup API
297 * The weight of each item in the bucket when
298 * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
299 */
300struct crush_bucket_uniform {
301 struct crush_bucket h; /*!< generic bucket information */
302 __u32 item_weight; /*!< 16.16 fixed point weight for each item */
303};
304
305/** @ingroup API
306 * The weight of each item in the bucket when
307 * __h.alg__ == ::CRUSH_BUCKET_LIST.
308 *
309 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
310 * [0,__h.size__[. The __sum_weight__[i] is the sum of the __item_weights[j]__
311 * for j in [0,i[.
312 *
313 */
314struct crush_bucket_list {
315 struct crush_bucket h; /*!< generic bucket information */
316 __u32 *item_weights; /*!< 16.16 fixed point weight for each item */
317 __u32 *sum_weights; /*!< 16.16 fixed point sum of the weights */
318};
319
320struct crush_bucket_tree {
321 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
322 actual items */
323 __u8 num_nodes;
324 __u32 *node_weights;
325};
326
327struct crush_bucket_straw {
328 struct crush_bucket h;
329 __u32 *item_weights; /* 16-bit fixed point */
330 __u32 *straws; /* 16-bit fixed point */
331};
332
333/** @ingroup API
334 * The weight of each item in the bucket when
335 * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
336 *
337 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
338 * [0,__h.size__[.
339 */
340struct crush_bucket_straw2 {
341 struct crush_bucket h; /*!< generic bucket information */
342 __u32 *item_weights; /*!< 16.16 fixed point weight for each item */
343};
344
345
346
347/** @ingroup API
348 *
349 * A crush map define a hierarchy of crush_bucket that end with leaves
350 * (buckets and leaves are called items) and a set of crush_rule to
351 * map an integer to items with the crush_do_rule() function.
352 *
353 */
354struct crush_map {
355 /*! An array of crush_bucket pointers of size __max_buckets__.
356 * An element of the array may be NULL if the bucket was removed with
357 * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
358 * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
359 */
360 struct crush_bucket **buckets;
361 /*! An array of crush_rule pointers of size __max_rules__.
362 * An element of the array may be NULL if the rule was removed (there is
363 * no API to do so but there may be one in the future). The rules must be added
364 * with crush_add_rule().
365 */
366 struct crush_rule **rules;
367 __s32 max_buckets; /*!< the size of __buckets__ */
368 __u32 max_rules; /*!< the size of __rules__ */
369 /*! The value of the highest item stored in the crush_map + 1
370 */
371 __s32 max_devices;
372
373 /*! Backward compatibility tunable. It implements a bad solution
374 * and must always be set to 0 except for backward compatibility
375 * purposes
376 */
377 __u32 choose_local_tries;
378 /*! Backward compatibility tunable. It implements a bad solution
379 * and must always be set to 0 except for backward compatibility
380 * purposes
381 */
382 __u32 choose_local_fallback_tries;
383 /*! Tunable. The default value when the CHOOSE_TRIES or
384 * CHOOSELEAF_TRIES steps are omitted in a rule. See the
385 * documentation for crush_rule_set_step() for more
386 * information
387 */
388 __u32 choose_total_tries;
389 /*! Backward compatibility tunable. It should always be set
390 * to 1 except for backward compatibility. Implemented in 2012
391 * it was generalized late 2013 and is mostly unused except
392 * in one border case, reason why it must be set to 1.
393 *
394 * Attempt chooseleaf inner descent once for firstn mode; on
395 * reject retry outer descent. Note that this does *not*
396 * apply to a collision: in that case we will retry as we
397 * used to.
398 */
399 __u32 chooseleaf_descend_once;
400 /*! Backward compatibility tunable. It is a fix for bad
401 * mappings implemented in 2014 at
402 * https://github.com/ceph/ceph/pull/1185. It should always
403 * be set to 1 except for backward compatibility.
404 *
405 * If non-zero, feed r into chooseleaf, bit-shifted right by
406 * (r-1) bits. a value of 1 is best for new clusters. for
407 * legacy clusters that want to limit reshuffling, a value of
408 * 3 or 4 will make the mappings line up a bit better with
409 * previous mappings.
410 */
411 __u8 chooseleaf_vary_r;
412
413 /*! Backward compatibility tunable. It is an improvement that
414 * avoids unnecessary mapping changes, implemented at
415 * https://github.com/ceph/ceph/pull/6572 and explained in
416 * this post: "chooseleaf may cause some unnecessary pg
417 * migrations" in October 2015
418 * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
419 * It should always be set to 1 except for backward compatibility.
420 */
421 __u8 chooseleaf_stable;
422
423 /*! @cond INTERNAL */
424 /* This value is calculated after decode or construction by
425 the builder. It is exposed here (rather than having a
426 'build CRUSH working space' function) so that callers can
427 reserve a static buffer, allocate space on the stack, or
428 otherwise avoid calling into the heap allocator if they
429 want to. The size of the working space depends on the map,
430 while the size of the scratch vector passed to the mapper
431 depends on the size of the desired result set.
432
433 Nothing stops the caller from allocating both in one swell
434 foop and passing in two points, though. */
435 size_t working_size;
436
437#ifndef __KERNEL__
438 /*! @endcond */
439 /*! Backward compatibility tunable. It is a fix for the straw
440 * scaler values for the straw algorithm which is deprecated
441 * (straw2 replaces it) implemented at
442 * https://github.com/ceph/ceph/pull/3057. It should always
443 * be set to 1 except for backward compatibility.
444 *
445 */
446 __u8 straw_calc_version;
447
448 /*! @cond INTERNAL */
449 /*
450 * allowed bucket algs is a bitmask, here the bit positions
451 * are CRUSH_BUCKET_*. note that these are *bits* and
452 * CRUSH_BUCKET_* values are not, so we need to or together (1
453 * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
454 * minimize confusion (bucket type values start at 1).
455 */
456 __u32 allowed_bucket_algs;
457
458 __u32 *choose_tries;
459#endif
460 /*! @endcond */
461};
462
463
464/* crush.c */
465/** @ingroup API
466 *
467 * Return the 16.16 fixed point weight of the item at __pos__ (zero
468 * based index) within the bucket __b__. If __pos__ is negative or
469 * greater or equal to the number of items in the bucket, return 0.
470 *
471 * @param b the bucket containing items
472 * @param pos the zero based index of the item
473 *
474 * @returns the 16.16 fixed point item weight
475 */
476extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
477extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
478extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
479extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
480extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
481extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
482/** @ingroup API
483 *
484 * Deallocate a bucket created via crush_add_bucket().
485 *
486 * @param b the bucket to deallocate
487 */
488extern void crush_destroy_bucket(struct crush_bucket *b);
489/** @ingroup API
490 *
491 * Deallocate a rule created via crush_add_rule().
492 *
493 * @param r the rule to deallocate
494 */
495extern void crush_destroy_rule(struct crush_rule *r);
496/** @ingroup API
497 *
498 * Deallocate the __map__, previously allocated with crush_create.
499 *
500 * @param map the crush map
501 */
502extern void crush_destroy(struct crush_map *map);
503
504static inline int crush_calc_tree_node(int i)
505{
506 return ((i+1) << 1)-1;
507}
508
31f18b77
FG
509static inline const char *crush_alg_name(int alg)
510{
511 switch (alg) {
512 case CRUSH_BUCKET_UNIFORM:
513 return "uniform";
514 case CRUSH_BUCKET_LIST:
515 return "list";
516 case CRUSH_BUCKET_TREE:
517 return "tree";
518 case CRUSH_BUCKET_STRAW:
519 return "straw";
520 case CRUSH_BUCKET_STRAW2:
521 return "straw2";
522 default:
523 return "unknown";
524 }
525}
526
7c673cae
FG
527/* ---------------------------------------------------------------------
528 Private
529 --------------------------------------------------------------------- */
530
531/* These data structures are private to the CRUSH implementation. They
532 are exposed in this header file because builder needs their
533 definitions to calculate the total working size.
534
535 Moving this out of the crush map allow us to treat the CRUSH map as
536 immutable within the mapper and removes the requirement for a CRUSH
537 map lock. */
538
539struct crush_work_bucket {
540 __u32 perm_x; /* @x for which *perm is defined */
541 __u32 perm_n; /* num elements of *perm that are permuted/defined */
542 __u32 *perm; /* Permutation of the bucket's items */
543};
544
545struct crush_work {
546 struct crush_work_bucket **work; /* Per-bucket working store */
547};
548
549#endif