#ifndef CEPH_CRUSH_BUILDER_H
#define CEPH_CRUSH_BUILDER_H

#include "crush.h"

/** @ingroup API
 *
 * Allocate a crush_map with __malloc(3)__ and initialize it. The
 * caller is responsible for deallocating the crush_map with
 * crush_destroy().
 *
 * The content of the allocated crush_map is set with
 * set_optimal_crush_map(). The caller is responsible for setting each
 * tunable in the __crush_map__ for backward compatibility or mapping
 * stability.
 *
 * @returns a pointer to the newly created crush_map or NULL
 */
extern struct crush_map *crush_create();
/** @ingroup API
 *
 * Analyze the content of __map__ and set the internal values required
 * before it can be used to map values with crush_do_rule(). The caller
 * must make sure it is run before crush_do_rule() and after any
 * function that modifies the __map__ (crush_add_bucket(), etc.).
 *
 * @param map the crush_map
 */
extern void crush_finalize(struct crush_map *map);
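/*
 * Usage sketch (illustrative only, not part of this header's API
 * surface): allocate a map, finalize it once it has been populated,
 * and release it with crush_destroy(), which is declared in crush.h.
 *
 *   struct crush_map *map = crush_create();
 *   if (map == NULL)
 *           return -ENOMEM;
 *   // ... add buckets and rules, see crush_add_bucket() and crush_add_rule() ...
 *   crush_finalize(map);    // must run before crush_do_rule()
 *   crush_destroy(map);
 */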

/* rules */
/** @ingroup API
 *
 * Allocate an empty crush_rule structure large enough to store __len__ steps.
 * Steps can be added to a rule via crush_rule_set_step(). The __ruleset__
 * is a user defined integer, not used by __libcrush__ and stored in
 * the allocated rule at __rule->mask.ruleset__.
 *
 * The rule is designed to allow crush_do_rule() to get at least __minsize__ items
 * and at most __maxsize__ items.
 *
 * The __type__ is defined by the caller and will be used by
 * crush_find_rule() when looking for a rule and by
 * __CRUSH_RULE_CHOOSE*__ steps when looking for items.
 *
 * The caller is responsible for deallocating the returned pointer via
 * crush_destroy_rule().
 *
 * If __malloc(3)__ fails, return NULL.
 *
 * @param len number of steps in the rule
 * @param ruleset user defined value
 * @param type user defined value
 * @param minsize minimum number of items the rule can map
 * @param maxsize maximum number of items the rule can map
 *
 * @returns a pointer to the newly created rule or NULL
 */
extern struct crush_rule *crush_make_rule(int len, int ruleset, int type, int minsize, int maxsize);
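/*
 * Allocation sketch (illustrative assumptions: a three step rule with
 * ruleset 0, type 1, mapping between 1 and 10 items):
 *
 *   struct crush_rule *rule = crush_make_rule(3, 0, 1, 1, 10);
 *   if (rule == NULL)
 *           return -ENOMEM;
 *   // ... fill the steps with crush_rule_set_step() ...
 *   crush_destroy_rule(rule);   // only if the rule is not handed to crush_add_rule()
 */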
/** @ingroup API
 *
 * Set the __pos__ step of the __rule__ to an operand and up to two arguments.
 * The value of the operand __op__ determines if the arguments are used and how:
 *
 * - __CRUSH_RULE_NOOP__ do nothing.
 * - __CRUSH_RULE_TAKE__ select the __arg1__ item
 * - __CRUSH_RULE_EMIT__ append the selection to the results and clear
 *   the selection
 *
 * - __CRUSH_RULE_CHOOSE_FIRSTN__ and __CRUSH_RULE_CHOOSE_INDEP__
 *   recursively explore each bucket currently selected, looking for
 *   __arg1__ items of type __arg2__ and select them.
 * - __CRUSH_RULE_CHOOSELEAF_FIRSTN__ and __CRUSH_RULE_CHOOSELEAF_INDEP__
 *   recursively explore each bucket currently selected, looking for
 *   __arg1__ leaves within all the buckets of type __arg2__ and
 *   select them.
 *
 * In all __CHOOSE__ steps, if __arg1__ is less than or equal to zero,
 * the number of items to select is equal to the __max_result__ argument
 * of crush_do_rule() minus __arg1__. It is common to set __arg1__ to zero
 * to select as many items as requested by __max_result__.
 *
 * - __CRUSH_RULE_SET_CHOOSE_TRIES__ and __CRUSH_RULE_SET_CHOOSELEAF_TRIES__
 *
 *   The CHOOSE_FIRSTN and CHOOSE_INDEP rule steps look for buckets of
 *   a given type, randomly selecting them. If they are unlucky and
 *   find the same bucket twice, they will try N+1 times (N being the
 *   value of the choose_total_tries tunable). If there is a previous
 *   SET_CHOOSE_TRIES step in the same rule, they will try C times
 *   instead (C being the value of the argument of the
 *   SET_CHOOSE_TRIES step).
 *
 *   Note: the __choose_total_tries__ tunable defined in crush_map is
 *   the number of retries, not the number of tries. The number of
 *   tries is the number of retries plus one. The SET_CHOOSE_TRIES rule
 *   step sets the number of tries and does not need the + 1. This
 *   confusing difference is inherited from an off-by-one bug from
 *   years ago.
 *
 *   The CHOOSELEAF_FIRSTN and CHOOSELEAF_INDEP rule steps do the same
 *   as CHOOSE_FIRSTN and CHOOSE_INDEP but also recursively explore
 *   each bucket found, looking for a single device. The same device
 *   may be found in two different buckets because the crush map is
 *   not a strict hierarchy; it is a DAG. When such a collision
 *   happens, they will try again. The number of times they try to
 *   find a non-colliding device is:
 *
 *   - If FIRSTN and there is no previous SET_CHOOSELEAF_TRIES rule
 *     step: try N + 1 times (N being the value of the
 *     __choose_total_tries__ tunable defined in crush_map)
 *
 *   - If FIRSTN and there is a previous SET_CHOOSELEAF_TRIES rule
 *     step: try P times (P being the value of the argument of the
 *     SET_CHOOSELEAF_TRIES rule step)
 *
 *   - If INDEP and there is no previous SET_CHOOSELEAF_TRIES rule
 *     step: try 1 time.
 *
 *   - If INDEP and there is a previous SET_CHOOSELEAF_TRIES rule step: try
 *     P times (P being the value of the argument of the SET_CHOOSELEAF_TRIES
 *     rule step)
 *
 * @param rule the rule in which the step is inserted
 * @param pos the zero based step index
 * @param op one of __CRUSH_RULE_NOOP__, __CRUSH_RULE_TAKE__, __CRUSH_RULE_CHOOSE_FIRSTN__, __CRUSH_RULE_CHOOSE_INDEP__, __CRUSH_RULE_CHOOSELEAF_FIRSTN__, __CRUSH_RULE_CHOOSELEAF_INDEP__, __CRUSH_RULE_SET_CHOOSE_TRIES__, __CRUSH_RULE_SET_CHOOSELEAF_TRIES__ or __CRUSH_RULE_EMIT__
 * @param arg1 first argument for __op__
 * @param arg2 second argument for __op__
 */
extern void crush_rule_set_step(struct crush_rule *rule, int pos, int op, int arg1, int arg2);
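/*
 * Sketch of a typical replicated rule (assumptions: bucket type 1 is
 * "host", item -1 is the root bucket, and the rule was allocated with
 * room for three steps):
 *
 *   crush_rule_set_step(rule, 0, CRUSH_RULE_TAKE, -1, 0);
 *   crush_rule_set_step(rule, 1, CRUSH_RULE_CHOOSELEAF_FIRSTN, 0, 1);
 *   crush_rule_set_step(rule, 2, CRUSH_RULE_EMIT, 0, 0);
 *
 * With __arg1__ set to zero in the CHOOSELEAF step, crush_do_rule()
 * selects as many leaves as requested by its __max_result__ argument.
 */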
/** @ingroup API
 *
 * Add the __rule__ into the crush __map__ and assign it the
 * __ruleno__ unique identifier. If __ruleno__ is -1, the function will
 * assign the lowest available identifier. The __ruleno__ value must be
 * a positive integer lower than __CRUSH_MAX_RULES__.
 *
 * - return -ENOSPC if the rule identifier is >= __CRUSH_MAX_RULES__
 * - return -ENOMEM if __realloc(3)__ fails to expand the array of
 *   rules in the __map__
 *
 * @param map the crush_map
 * @param rule the rule to add to the __map__
 * @param ruleno a positive integer < __CRUSH_MAX_RULES__ or -1
 *
 * @returns the rule unique identifier on success, < 0 on error
 */
extern int crush_add_rule(struct crush_map *map, struct crush_rule *rule, int ruleno);
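/*
 * Sketch (letting the library pick the identifier by passing -1):
 *
 *   int ruleno = crush_add_rule(map, rule, -1);
 *   if (ruleno < 0)
 *           return ruleno;          // -ENOSPC or -ENOMEM
 *   // the map now owns the rule; do not call crush_destroy_rule() on it
 */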

/* buckets */
extern int crush_get_next_bucket_id(struct crush_map *map);
/** @ingroup API
 *
 * Add __bucket__ into the crush __map__ and assign it the
 * __bucketno__ unique identifier. If __bucketno__ is 0, the function
 * will assign the lowest available identifier. The bucket identifier
 * must be a negative integer. The bucket identifier is returned via
 * __idout__.
 *
 * - return -ENOMEM if __realloc(3)__ fails to expand the array of
 *   buckets in the __map__
 * - return -EEXIST if the __bucketno__ identifier is already assigned
 *   to another bucket.
 *
 * @param[in] map the crush_map
 * @param[in] bucketno the bucket unique identifier or 0
 * @param[in] bucket the bucket to add to the __map__
 * @param[out] idout a pointer to the bucket identifier
 *
 * @returns 0 on success, < 0 on error
 */
extern int crush_add_bucket(struct crush_map *map,
			    int bucketno,
			    struct crush_bucket *bucket, int *idout);
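/*
 * Sketch (passing 0 so that the lowest available negative identifier
 * is chosen; __bucket__ is assumed to come from crush_make_bucket()):
 *
 *   int id;
 *   int r = crush_add_bucket(map, 0, bucket, &id);
 *   if (r < 0)
 *           return r;               // -ENOMEM or -EEXIST
 *   // id now holds the (negative) identifier usable in CRUSH_RULE_TAKE steps
 */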
/** @ingroup API
 *
 * Allocate a crush_bucket with __malloc(3)__ and initialize it. The
 * content of the bucket is filled with __size__ items from
 * __items__. The item selection is set to use __alg__ which is one of
 * ::CRUSH_BUCKET_UNIFORM , ::CRUSH_BUCKET_LIST or
 * ::CRUSH_BUCKET_STRAW2. The initial __items__ are assigned a
 * weight from the __weights__ array, depending on the value of
 * __alg__. If __alg__ is ::CRUSH_BUCKET_UNIFORM, all items are set
 * to have a weight equal to __weights[0]__, otherwise the weight of
 * __items[x]__ is set to be the value of __weights[x]__.
 *
 * The caller is responsible for deallocating the returned pointer via
 * crush_destroy_bucket().
 *
 * @param map __unused__
 * @param alg algorithm for item selection
 * @param hash always set to CRUSH_HASH_RJENKINS1
 * @param type user defined bucket type
 * @param size the size of the __items__ array
 * @param items array of __size__ items
 * @param weights the weight of each item in __items__, depending on __alg__
 *
 * @returns a pointer to the newly created bucket or NULL
 */
struct crush_bucket *crush_make_bucket(struct crush_map *map, int alg, int hash, int type, int size, int *items, int *weights);
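/*
 * Sketch of a straw2 bucket holding two devices (assumptions: bucket
 * type 1, devices 0 and 1, and weights expressed in the 16.16 fixed
 * point format used by CRUSH, 0x10000 being a weight of 1.0):
 *
 *   int items[] = { 0, 1 };
 *   int weights[] = { 0x10000, 0x20000 };
 *   struct crush_bucket *bucket =
 *           crush_make_bucket(map, CRUSH_BUCKET_STRAW2, CRUSH_HASH_RJENKINS1,
 *                             1, 2, items, weights);
 *   if (bucket == NULL)
 *           return -ENOMEM;
 */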
extern struct crush_choose_arg *crush_make_choose_args(struct crush_map *map, int num_positions);
extern void crush_destroy_choose_args(struct crush_choose_arg *args);
/** @ingroup API
 *
 * Add __item__ to __bucket__ with __weight__. The weight of the new
 * item is added to the weight of the bucket so that it reflects
 * the total weight of all items.
 *
 * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM, the value of __weight__ must be equal to
 * __((struct crush_bucket_uniform *)bucket)->item_weight__.
 *
 * - return -ENOMEM if the __bucket__ cannot be resized with __realloc(3)__.
 * - return -ERANGE if adding __weight__ to the weight of the bucket overflows.
 * - return -EINVAL if __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM and
 *   the __weight__ is not equal to __((struct crush_bucket_uniform *)bucket)->item_weight__.
 * - return -1 if the value of __bucket->alg__ is unknown.
 *
 * @returns 0 on success, < 0 on error
 */
extern int crush_bucket_add_item(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
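/*
 * Sketch (adding device 2 with a weight of 1.0 to a non-uniform
 * bucket; 0x10000 is assumed to be the 16.16 fixed point encoding of 1.0):
 *
 *   int r = crush_bucket_add_item(map, bucket, 2, 0x10000);
 *   if (r < 0)
 *           return r;               // -ENOMEM, -ERANGE, -EINVAL or -1
 */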
/** @ingroup API
 *
 * If __bucket->alg__ is ::CRUSH_BUCKET_UNIFORM,
 * __((struct crush_bucket_uniform *)bucket)->item_weight__ is set to __weight__ and the
 * weight of the bucket is set to be the number of items in the bucket times the weight.
 * The return value is the difference between the new bucket weight and the former
 * bucket weight. The __item__ argument is ignored.
 *
 * If __bucket->alg__ is different from ::CRUSH_BUCKET_UNIFORM,
 * set the __weight__ of __item__ in __bucket__. The former weight of the
 * item is subtracted from the weight of the bucket and the new weight is added.
 * The return value is the difference between the new item weight and the former
 * item weight.
 *
 * @returns the difference between the new weight and the former weight
 */
extern int crush_bucket_adjust_item_weight(struct crush_map *map, struct crush_bucket *bucket, int item, int weight);
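/*
 * Sketch (doubling the weight of device 2 in a non-uniform bucket;
 * the previous weight is assumed to have been 0x10000, i.e. 1.0):
 *
 *   int delta = crush_bucket_adjust_item_weight(map, bucket, 2, 0x20000);
 *   // delta is 0x10000, the difference between the new and the former weight
 */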
/** @ingroup API
 *
 * Recursively update the weight of __bucket__ and its children, depth
 * first. The __bucket__ weight is set to the sum of the weight of the
 * items it contains.
 *
 * - return -ERANGE if the sum of the weight of the items in __bucket__ overflows.
 * - return -1 if the value of __bucket->alg__ is unknown.
 *
 * @param map a crush_map containing __bucket__
 * @param bucket the root of the tree to reweight
 * @returns 0 on success, < 0 on error
 */
extern int crush_reweight_bucket(struct crush_map *map, struct crush_bucket *bucket);
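/*
 * Sketch (recomputing the weight of a root bucket after its children
 * were modified directly; __root__ is an assumed bucket already in __map__):
 *
 *   int r = crush_reweight_bucket(map, root);
 *   if (r < 0)
 *           return r;               // -ERANGE or -1
 */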
/** @ingroup API
 *
 * Remove __bucket__ from __map__ and deallocate it via crush_destroy_bucket().
 * __assert(3)__ that __bucket__ is in __map__. The caller is responsible for
 * making sure the bucket is not the child of any other bucket in the __map__.
 *
 * @param map a crush_map containing __bucket__
 * @param bucket the bucket to remove from __map__
 * @returns 0
 */
extern int crush_remove_bucket(struct crush_map *map, struct crush_bucket *bucket);
/** @ingroup API
 *
 * Remove __item__ from __bucket__ and subtract the item weight from
 * the bucket weight. If the weight of the item is greater than the
 * weight of the bucket, silently set the bucket weight to zero.
 *
 * - return -ENOMEM if the __bucket__ cannot be sized down with __realloc(3)__.
 * - return -1 if the value of __bucket->alg__ is unknown.
 *
 * @param map __unused__
 * @param bucket the bucket from which __item__ is removed
 * @param item the item to remove from __bucket__
 * @returns 0 on success, < 0 on error
 */
extern int crush_bucket_remove_item(struct crush_map *map, struct crush_bucket *bucket, int item);
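/*
 * Sketch (removing device 2 from a bucket, then dropping the bucket
 * from the map once empty; the bucket is assumed not to be referenced
 * by any other bucket):
 *
 *   int r = crush_bucket_remove_item(map, bucket, 2);
 *   if (r < 0)
 *           return r;               // -ENOMEM or -1
 *   if (bucket->size == 0)
 *           crush_remove_bucket(map, bucket);   // also deallocates the bucket
 */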

struct crush_bucket_uniform *
crush_make_uniform_bucket(int hash, int type, int size,
			  int *items,
			  int item_weight);
struct crush_bucket_list*
crush_make_list_bucket(int hash, int type, int size,
		       int *items,
		       int *weights);
struct crush_bucket_tree*
crush_make_tree_bucket(int hash, int type, int size,
		       int *items,    /* in leaf order */
		       int *weights);
struct crush_bucket_straw *
crush_make_straw_bucket(struct crush_map *map,
			int hash, int type, int size,
			int *items,
			int *weights);

extern int crush_addition_is_unsafe(__u32 a, __u32 b);
extern int crush_multiplication_is_unsafe(__u32 a, __u32 b);

/** @ingroup API
 *
 * Set the __map__ tunables to implement the most ancient behavior,
 * for backward compatibility purposes only.
 *
 * - choose_local_tries == 2
 * - choose_local_fallback_tries == 5
 * - choose_total_tries == 19
 * - chooseleaf_descend_once == 0
 * - chooseleaf_vary_r == 0
 * - straw_calc_version == 0
 * - chooseleaf_stable == 0
 *
 * See the __crush_map__ documentation for more information about
 * each tunable.
 *
 * @param map a crush_map
 */
extern void set_legacy_crush_map(struct crush_map *map);
/** @ingroup API
 *
 * Set the __map__ tunables to implement the optimal behavior. These
 * are the values set by crush_create(). It does not guarantee a
 * stable mapping after an upgrade.
 *
 * For instance, when a bug is fixed it may significantly change the
 * mapping. In that case a new tunable (say tunable_new) is added so
 * the caller can control when the bug fix is activated. The
 * set_optimal_crush_map() function will always set all tunables,
 * including tunable_new, to fix all bugs even if it means changing
 * the mapping. If the caller needs fine-grained control on the
 * tunables to upgrade to a new version without changing the mapping,
 * it needs to set the __crush_map__ tunables individually.
 *
 * See the __crush_map__ documentation for more information about
 * each tunable.
 *
 * @param map a crush_map
 */
extern void set_optimal_crush_map(struct crush_map *map);
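/*
 * Sketch (choosing a tunable profile right after allocation; the
 * legacy call is only needed when compatibility with very old clients
 * is required):
 *
 *   struct crush_map *map = crush_create();    // already set to optimal
 *   set_legacy_crush_map(map);                 // downgrade tunables if needed
 *   set_optimal_crush_map(map);                // back to the optimal profile
 */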

#endif