1 #ifndef CEPH_CRUSH_CRUSH_H
2 #define CEPH_CRUSH_CRUSH_H
5 # include <linux/types.h>
7 # include "crush_compat.h"
11 * CRUSH is a pseudo-random data distribution algorithm that
12 * efficiently distributes input values (typically, data objects)
13 * across a heterogeneous, structured storage cluster.
15 * The algorithm was originally described in detail in this paper
16 * (although the algorithm has evolved somewhat since then):
18 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
20 * LGPL-2.1 or LGPL-3.0
24 #define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
26 #define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
27 #define CRUSH_MAX_RULES (1<<8) /* max crush rule id */
29 #define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
30 #define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
32 #define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
34 * The equivalent of NULL for an item, i.e. the absence of an item.
36 #define CRUSH_ITEM_NONE 0x7fffffff
39 * CRUSH uses user-defined "rules" to describe how inputs should be
40 * mapped to devices. A rule consists of sequence of steps to perform
41 * to generate the set of output devices.
43 struct crush_rule_step
{
55 CRUSH_RULE_TAKE
= 1, /* arg1 = value to start with */
56 CRUSH_RULE_CHOOSE_FIRSTN
= 2, /* arg1 = num items to pick */
58 CRUSH_RULE_CHOOSE_INDEP
= 3, /* same */
59 CRUSH_RULE_EMIT
= 4, /* no args */
60 CRUSH_RULE_CHOOSELEAF_FIRSTN
= 6,
61 CRUSH_RULE_CHOOSELEAF_INDEP
= 7,
63 CRUSH_RULE_SET_CHOOSE_TRIES
= 8, /* override choose_total_tries */
64 CRUSH_RULE_SET_CHOOSELEAF_TRIES
= 9, /* override chooseleaf_descend_once */
65 CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES
= 10,
66 CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES
= 11,
67 CRUSH_RULE_SET_CHOOSELEAF_VARY_R
= 12,
68 CRUSH_RULE_SET_CHOOSELEAF_STABLE
= 13
72 * for specifying choose num (arg1) relative to the max parameter
75 #define CRUSH_CHOOSE_N 0
76 #define CRUSH_CHOOSE_N_MINUS(x) (-(x))
80 __u8 __unused_was_rule_mask_ruleset
;
82 __u8 deprecated_min_size
;
83 __u8 deprecated_max_size
;
84 struct crush_rule_step steps
[0];
87 #define crush_rule_size(len) (sizeof(struct crush_rule) + \
88 (len)*sizeof(struct crush_rule_step))
93 * A bucket is a named container of other items (either devices or
99 * Items within a bucket are chosen with crush_do_rule() using one of
100 * three algorithms representing a tradeoff between performance and
101 * reorganization efficiency. If you are unsure of which bucket type
102 * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
104 * The table summarizes how the speed of each option measures up
105 * against mapping stability when items are added or removed.
107 * Bucket Alg Speed Additions Removals
108 * ------------------------------------------------
109 * uniform O(1) poor poor
110 * list O(n) optimal poor
111 * straw2 O(n) optimal optimal
113 enum crush_algorithm
{
115 * Devices are rarely added individually in a large system.
116 * Instead, new storage is typically deployed in blocks of identical
117 * devices, often as an additional shelf in a server rack or perhaps
118 * an entire cabinet. Devices reaching their end of life are often
119 * similarly decommissioned as a set (individual failures aside),
120 * making it natural to treat them as a unit. CRUSH uniform buckets
121 * are used to represent an identical set of devices in such
122 * circumstances. The key advantage in doing so is performance
123 * related: CRUSH can map replicas into uniform buckets in constant
124 * time. In cases where the uniformity restrictions are not
125 * appropriate, other bucket types can be used. If the size of a
126 * uniform bucket changes, there is a complete reshuffling of data
127 * between devices, much like conventional hash-based distribution
130 CRUSH_BUCKET_UNIFORM
= 1,
132 * List buckets structure their contents as a linked list, and
133 * can contain items with arbitrary weights. To place a
134 * replica, CRUSH begins at the head of the list with the most
135 * recently added item and compares its weight to the sum of
136 * all remaining items' weights. Depending on the value of
137 * hash( x , r , item), either the current item is chosen with
138 * the appropriate probability, or the process continues
139 * recursively down the list. This is a natural and intuitive
140 * choice for an expanding cluster: either an object is
141 * relocated to the newest device with some appropriate
142 * probability, or it remains on the older devices as before.
143 * The result is optimal data migration when items are added
144 * to the bucket. Items removed from the middle or tail of the
145 * list, however, can result in a significant amount of
146 * unnecessary movement, making list buckets most suitable for
147 * circumstances in which they never (or very rarely) shrink.
149 CRUSH_BUCKET_LIST
= 2,
150 /*! @cond INTERNAL */
151 CRUSH_BUCKET_TREE
= 3,
152 CRUSH_BUCKET_STRAW
= 4,
155 * List and tree buckets are structured such that a limited
156 * number of hash values need to be calculated and compared to
157 * weights in order to select a bucket item. In doing so,
158 * they divide and conquer in a way that either gives certain
159 * items precedence (e. g., those at the beginning of a list)
160 * or obviates the need to consider entire subtrees of items
161 * at all. That improves the performance of the replica
162 * placement process, but can also introduce suboptimal
163 * reorganization behavior when the contents of a bucket
164 * change due an addition, removal, or re-weighting of an
167 * The straw2 bucket type allows all items to fairly "compete"
168 * against each other for replica placement through a process
169 * analogous to a draw of straws. To place a replica, a straw
170 * of random length is drawn for each item in the bucket. The
171 * item with the longest straw wins. The length of each straw
172 * is initially a value in a fixed range. Each straw length
173 * is scaled by a factor based on the item's weight so that
174 * heavily weighted items are more likely to win the draw.
175 * Although this process is almost twice as slow (on average)
176 * than a list bucket and even slower than a tree bucket
177 * (which scales logarithmically), straw2 buckets result in
178 * optimal data movement between nested items when modified.
180 CRUSH_BUCKET_STRAW2
= 5,
182 extern const char *crush_bucket_alg_name(int alg
);
185 * although tree was a legacy algorithm, it has been buggy, so
188 #define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
189 (1 << CRUSH_BUCKET_UNIFORM) | \
190 (1 << CRUSH_BUCKET_LIST) | \
191 (1 << CRUSH_BUCKET_STRAW))
195 * A bucket contains __size__ __items__ which are either positive
196 * numbers or negative numbers that reference other buckets and is
197 * uniquely identified with __id__ which is a negative number. The
198 * __weight__ of a bucket is the cumulative weight of all its
199 * children. A bucket is assigned a ::crush_algorithm that is used by
200 * crush_do_rule() to draw an item depending on its weight. A bucket
201 * can be assigned a strictly positive (> 0) __type__ defined by the
202 * caller. The __type__ can be used by crush_do_rule(), when it is
203 * given as an argument of a rule step.
205 * A pointer to crush_bucket can safely be cast into the following
206 * structure, depending on the value of __alg__:
208 * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
209 * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
210 * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
212 * The weight of each item depends on the algorithm and the
213 * information about it is available in the corresponding structure
214 * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
216 * See crush_map for more information on how __id__ is used
217 * to reference the bucket.
219 struct crush_bucket
{
220 __s32 id
; /*!< bucket identifier, < 0 and unique within a crush_map */
221 __u16 type
; /*!< > 0 bucket type, defined by the caller */
222 __u8 alg
; /*!< the item selection ::crush_algorithm */
223 /*! @cond INTERNAL */
224 __u8 hash
; /* which hash function to use, CRUSH_HASH_* */
226 __u32 weight
; /*!< 16.16 fixed point cumulated children weight */
227 __u32 size
; /*!< size of the __items__ array */
228 __s32
*items
; /*!< array of children: < 0 are buckets, >= 0 items */
233 * Replacement weights for each item in a bucket. The size of the
234 * array must be exactly the size of the straw2 bucket, just as the
235 * item_weights array.
238 struct crush_weight_set
{
239 __u32
*weights
; /*!< 16.16 fixed point weights in the same order as items */
240 __u32 size
; /*!< size of the __weights__ array */
245 * Replacement weights and ids for a given straw2 bucket, for
246 * placement purposes.
248 * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
249 * replacement weights found at __weight_set[N]__ are used instead of
250 * the weights from __item_weights__. If __N__ is greater than
251 * __weight_set_positions__, the weights found at __weight_set_positions-1__ are
252 * used instead. For instance if __weight_set__ is:
254 * [ [ 0x10000, 0x20000 ], // position 0
255 * [ 0x20000, 0x40000 ] ] // position 1
257 * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
258 * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
259 * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
263 struct crush_choose_arg
{
264 __s32
*ids
; /*!< values to use instead of items */
265 __u32 ids_size
; /*!< size of the __ids__ array */
266 struct crush_weight_set
*weight_set
; /*!< weight replacements for a given position */
267 __u32 weight_set_positions
; /*!< size of the __weight_set__ array */
272 * Replacement weights and ids for each bucket in the crushmap. The
273 * __size__ of the __args__ array must be exactly the same as the
274 * __map->max_buckets__.
276 * The __crush_choose_arg__ at index N will be used when choosing
277 * an item from the bucket __map->buckets[N]__ bucket, provided it
278 * is a straw2 bucket.
281 struct crush_choose_arg_map
{
282 struct crush_choose_arg
*args
; /*!< replacement for each bucket in the crushmap */
283 __u32 size
; /*!< size of the __args__ array */
287 * The weight of each item in the bucket when
288 * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
290 struct crush_bucket_uniform
{
291 struct crush_bucket h
; /*!< generic bucket information */
292 __u32 item_weight
; /*!< 16.16 fixed point weight for each item */
296 * The weight of each item in the bucket when
297 * __h.alg__ == ::CRUSH_BUCKET_LIST.
299 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
300 * [0,__h.size__[. The __sum_weight__[i] is the sum of the __item_weights[j]__
304 struct crush_bucket_list
{
305 struct crush_bucket h
; /*!< generic bucket information */
306 __u32
*item_weights
; /*!< 16.16 fixed point weight for each item */
307 __u32
*sum_weights
; /*!< 16.16 fixed point sum of the weights */
310 struct crush_bucket_tree
{
311 struct crush_bucket h
; /* note: h.size is _tree_ size, not number of
317 struct crush_bucket_straw
{
318 struct crush_bucket h
;
319 __u32
*item_weights
; /* 16-bit fixed point */
320 __u32
*straws
; /* 16-bit fixed point */
324 * The weight of each item in the bucket when
325 * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
327 * The weight of __h.items[i]__ is __item_weights[i]__ for i in
330 struct crush_bucket_straw2
{
331 struct crush_bucket h
; /*!< generic bucket information */
332 __u32
*item_weights
; /*!< 16.16 fixed point weight for each item */
339 * A crush map define a hierarchy of crush_bucket that end with leaves
340 * (buckets and leaves are called items) and a set of crush_rule to
341 * map an integer to items with the crush_do_rule() function.
345 /*! An array of crush_bucket pointers of size __max_buckets__.
346 * An element of the array may be NULL if the bucket was removed with
347 * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
348 * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
350 struct crush_bucket
**buckets
;
351 /*! An array of crush_rule pointers of size __max_rules__.
352 * An element of the array may be NULL if the rule was removed (there is
353 * no API to do so but there may be one in the future). The rules must be added
354 * with crush_add_rule().
356 struct crush_rule
**rules
;
357 __s32 max_buckets
; /*!< the size of __buckets__ */
358 __u32 max_rules
; /*!< the size of __rules__ */
359 /*! The value of the highest item stored in the crush_map + 1
363 /*! Backward compatibility tunable. It implements a bad solution
364 * and must always be set to 0 except for backward compatibility
367 __u32 choose_local_tries
;
368 /*! Backward compatibility tunable. It implements a bad solution
369 * and must always be set to 0 except for backward compatibility
372 __u32 choose_local_fallback_tries
;
373 /*! Tunable. The default value when the CHOOSE_TRIES or
374 * CHOOSELEAF_TRIES steps are omitted in a rule. See the
375 * documentation for crush_rule_set_step() for more
378 __u32 choose_total_tries
;
379 /*! Backward compatibility tunable. It should always be set
380 * to 1 except for backward compatibility. Implemented in 2012
381 * it was generalized late 2013 and is mostly unused except
382 * in one border case, reason why it must be set to 1.
384 * Attempt chooseleaf inner descent once for firstn mode; on
385 * reject retry outer descent. Note that this does *not*
386 * apply to a collision: in that case we will retry as we
389 __u32 chooseleaf_descend_once
;
390 /*! Backward compatibility tunable. It is a fix for bad
391 * mappings implemented in 2014 at
392 * https://github.com/ceph/ceph/pull/1185. It should always
393 * be set to 1 except for backward compatibility.
395 * If non-zero, feed r into chooseleaf, bit-shifted right by
396 * (r-1) bits. a value of 1 is best for new clusters. for
397 * legacy clusters that want to limit reshuffling, a value of
398 * 3 or 4 will make the mappings line up a bit better with
401 __u8 chooseleaf_vary_r
;
403 /*! Backward compatibility tunable. It is an improvement that
404 * avoids unnecessary mapping changes, implemented at
405 * https://github.com/ceph/ceph/pull/6572 and explained in
406 * this post: "chooseleaf may cause some unnecessary pg
407 * migrations" in October 2015
408 * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
409 * It should always be set to 1 except for backward compatibility.
411 __u8 chooseleaf_stable
;
413 /*! @cond INTERNAL */
414 /* This value is calculated after decode or construction by
415 the builder. It is exposed here (rather than having a
416 'build CRUSH working space' function) so that callers can
417 reserve a static buffer, allocate space on the stack, or
418 otherwise avoid calling into the heap allocator if they
419 want to. The size of the working space depends on the map,
420 while the size of the scratch vector passed to the mapper
421 depends on the size of the desired result set.
423 Nothing stops the caller from allocating both in one swell
424 foop and passing in two points, though. */
429 /*! Backward compatibility tunable. It is a fix for the straw
430 * scaler values for the straw algorithm which is deprecated
431 * (straw2 replaces it) implemented at
432 * https://github.com/ceph/ceph/pull/3057. It should always
433 * be set to 1 except for backward compatibility.
436 __u8 straw_calc_version
;
438 /*! @cond INTERNAL */
440 * allowed bucket algs is a bitmask, here the bit positions
441 * are CRUSH_BUCKET_*. note that these are *bits* and
442 * CRUSH_BUCKET_* values are not, so we need to or together (1
443 * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
444 * minimize confusion (bucket type values start at 1).
446 __u32 allowed_bucket_algs
;
457 * Return the 16.16 fixed point weight of the item at __pos__ (zero
458 * based index) within the bucket __b__. If __pos__ is negative or
459 * greater or equal to the number of items in the bucket, return 0.
461 * @param b the bucket containing items
462 * @param pos the zero based index of the item
464 * @returns the 16.16 fixed point item weight
466 extern int crush_get_bucket_item_weight(const struct crush_bucket
*b
, int pos
);
467 extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform
*b
);
468 extern void crush_destroy_bucket_list(struct crush_bucket_list
*b
);
469 extern void crush_destroy_bucket_tree(struct crush_bucket_tree
*b
);
470 extern void crush_destroy_bucket_straw(struct crush_bucket_straw
*b
);
471 extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2
*b
);
474 * Deallocate a bucket created via crush_add_bucket().
476 * @param b the bucket to deallocate
478 extern void crush_destroy_bucket(struct crush_bucket
*b
);
481 * Deallocate a rule created via crush_add_rule().
483 * @param r the rule to deallocate
485 extern void crush_destroy_rule(struct crush_rule
*r
);
488 * Deallocate the __map__, previously allocated with crush_create.
490 * @param map the crush map
492 extern void crush_destroy(struct crush_map
*map
);
494 static inline int crush_calc_tree_node(int i
)
496 return ((i
+1) << 1)-1;
499 static inline const char *crush_alg_name(int alg
)
502 case CRUSH_BUCKET_UNIFORM
:
504 case CRUSH_BUCKET_LIST
:
506 case CRUSH_BUCKET_TREE
:
508 case CRUSH_BUCKET_STRAW
:
510 case CRUSH_BUCKET_STRAW2
:
517 /* ---------------------------------------------------------------------
519 --------------------------------------------------------------------- */
521 /* These data structures are private to the CRUSH implementation. They
522 are exposed in this header file because builder needs their
523 definitions to calculate the total working size.
525 Moving this out of the crush map allow us to treat the CRUSH map as
526 immutable within the mapper and removes the requirement for a CRUSH
529 struct crush_work_bucket
{
530 __u32 perm_x
; /* @x for which *perm is defined */
531 __u32 perm_n
; /* num elements of *perm that are permuted/defined */
532 __u32
*perm
; /* Permutation of the bucket's items */
533 } __attribute__ ((packed
));
536 struct crush_work_bucket
**work
; /* Per-bucket working store */