8 #include "crush/crush.h"
11 #define dprintk(args...) /* printf(args) */
13 #define BUG_ON(x) assert(!(x))
15 struct crush_map
*crush_create()
18 m
= malloc(sizeof(*m
));
21 memset(m
, 0, sizeof(*m
));
23 set_optimal_crush_map(m
);
28 * finalize should be called _after_ all buckets are added to the map.
30 void crush_finalize(struct crush_map
*map
)
35 /* Calculate the needed working space while we do other
36 finalization tasks. */
37 map
->working_size
= sizeof(struct crush_work
);
38 /* Space for the array of pointers to per-bucket workspace */
39 map
->working_size
+= map
->max_buckets
*
40 sizeof(struct crush_work_bucket
*);
42 /* calc max_devices */
44 for (b
=0; b
<map
->max_buckets
; b
++) {
45 if (map
->buckets
[b
] == 0)
47 for (i
=0; i
<map
->buckets
[b
]->size
; i
++)
48 if (map
->buckets
[b
]->items
[i
] >= map
->max_devices
)
49 map
->max_devices
= map
->buckets
[b
]->items
[i
] + 1;
51 switch (map
->buckets
[b
]->alg
) {
53 /* The base case, permutation variables and
54 the pointer to the permutation array. */
55 map
->working_size
+= sizeof(struct crush_work_bucket
);
58 /* Every bucket has a permutation array. */
59 map
->working_size
+= map
->buckets
[b
]->size
* sizeof(__u32
);
67 int crush_add_rule(struct crush_map
*map
, struct crush_rule
*rule
, int ruleno
)
72 for (r
=0; r
< map
->max_rules
; r
++)
73 if (map
->rules
[r
] == 0)
75 assert(r
< CRUSH_MAX_RULES
);
80 if (r
>= map
->max_rules
) {
83 void *_realloc
= NULL
;
84 if (map
->max_rules
+1 > CRUSH_MAX_RULES
)
86 oldsize
= map
->max_rules
;
88 if ((_realloc
= realloc(map
->rules
, map
->max_rules
* sizeof(map
->rules
[0]))) == NULL
) {
91 map
->rules
= _realloc
;
93 memset(map
->rules
+ oldsize
, 0, (map
->max_rules
-oldsize
) * sizeof(map
->rules
[0]));
101 struct crush_rule
*crush_make_rule(int len
, int ruleset
, int type
, int minsize
, int maxsize
)
103 struct crush_rule
*rule
;
104 rule
= malloc(crush_rule_size(len
));
108 rule
->mask
.ruleset
= ruleset
;
109 rule
->mask
.type
= type
;
110 rule
->mask
.min_size
= minsize
;
111 rule
->mask
.max_size
= maxsize
;
116 * be careful; this doesn't verify that the buffer you allocated is big enough!
118 void crush_rule_set_step(struct crush_rule
*rule
, int n
, int op
, int arg1
, int arg2
)
120 assert((__u32
)n
< rule
->len
);
121 rule
->steps
[n
].op
= op
;
122 rule
->steps
[n
].arg1
= arg1
;
123 rule
->steps
[n
].arg2
= arg2
;
128 int crush_get_next_bucket_id(struct crush_map
*map
)
131 for (pos
=0; pos
< map
->max_buckets
; pos
++)
132 if (map
->buckets
[pos
] == 0)
138 int crush_add_bucket(struct crush_map
*map
,
140 struct crush_bucket
*bucket
,
145 /* find a bucket id */
147 id
= crush_get_next_bucket_id(map
);
150 while (pos
>= map
->max_buckets
) {
152 int oldsize
= map
->max_buckets
;
153 if (map
->max_buckets
)
154 map
->max_buckets
*= 2;
156 map
->max_buckets
= 8;
157 void *_realloc
= NULL
;
158 if ((_realloc
= realloc(map
->buckets
, map
->max_buckets
* sizeof(map
->buckets
[0]))) == NULL
) {
161 map
->buckets
= _realloc
;
163 memset(map
->buckets
+ oldsize
, 0, (map
->max_buckets
-oldsize
) * sizeof(map
->buckets
[0]));
166 if (map
->buckets
[pos
] != 0) {
172 map
->buckets
[pos
] = bucket
;
174 if (idout
) *idout
= id
;
178 int crush_remove_bucket(struct crush_map
*map
, struct crush_bucket
*bucket
)
180 int pos
= -1 - bucket
->id
;
181 assert(pos
< map
->max_buckets
);
182 map
->buckets
[pos
] = NULL
;
183 crush_destroy_bucket(bucket
);
190 struct crush_bucket_uniform
*
191 crush_make_uniform_bucket(int hash
, int type
, int size
,
196 struct crush_bucket_uniform
*bucket
;
198 bucket
= malloc(sizeof(*bucket
));
201 memset(bucket
, 0, sizeof(*bucket
));
202 bucket
->h
.alg
= CRUSH_BUCKET_UNIFORM
;
203 bucket
->h
.hash
= hash
;
204 bucket
->h
.type
= type
;
205 bucket
->h
.size
= size
;
207 if (crush_multiplication_is_unsafe(size
, item_weight
))
210 bucket
->h
.weight
= size
* item_weight
;
211 bucket
->item_weight
= item_weight
;
212 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
214 if (!bucket
->h
.items
)
217 for (i
=0; i
<size
; i
++)
218 bucket
->h
.items
[i
] = items
[i
];
222 free(bucket
->h
.items
);
230 struct crush_bucket_list
*
231 crush_make_list_bucket(int hash
, int type
, int size
,
237 struct crush_bucket_list
*bucket
;
239 bucket
= malloc(sizeof(*bucket
));
242 memset(bucket
, 0, sizeof(*bucket
));
243 bucket
->h
.alg
= CRUSH_BUCKET_LIST
;
244 bucket
->h
.hash
= hash
;
245 bucket
->h
.type
= type
;
246 bucket
->h
.size
= size
;
248 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
249 if (!bucket
->h
.items
)
253 bucket
->item_weights
= malloc(sizeof(__u32
)*size
);
254 if (!bucket
->item_weights
)
256 bucket
->sum_weights
= malloc(sizeof(__u32
)*size
);
257 if (!bucket
->sum_weights
)
260 for (i
=0; i
<size
; i
++) {
261 bucket
->h
.items
[i
] = items
[i
];
262 bucket
->item_weights
[i
] = weights
[i
];
264 if (crush_addition_is_unsafe(w
, weights
[i
]))
268 bucket
->sum_weights
[i
] = w
;
269 /*dprintk("pos %d item %d weight %d sum %d\n",
270 i, items[i], weights[i], bucket->sum_weights[i]);*/
273 bucket
->h
.weight
= w
;
277 free(bucket
->sum_weights
);
278 free(bucket
->item_weights
);
279 free(bucket
->h
.items
);
287 static int height(int n
) {
289 while ((n
& 1) == 0) {
295 static int on_right(int n
, int h
) {
296 return n
& (1 << (h
+1));
298 static int parent(int n
)
307 static int calc_depth(int size
)
322 struct crush_bucket_tree
*
323 crush_make_tree_bucket(int hash
, int type
, int size
,
324 int *items
, /* in leaf order */
327 struct crush_bucket_tree
*bucket
;
332 bucket
= malloc(sizeof(*bucket
));
335 memset(bucket
, 0, sizeof(*bucket
));
336 bucket
->h
.alg
= CRUSH_BUCKET_TREE
;
337 bucket
->h
.hash
= hash
;
338 bucket
->h
.type
= type
;
339 bucket
->h
.size
= size
;
342 bucket
->h
.items
= NULL
;
343 bucket
->h
.weight
= 0;
344 bucket
->node_weights
= NULL
;
345 bucket
->num_nodes
= 0;
346 /* printf("size 0 depth 0 nodes 0\n"); */
350 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
351 if (!bucket
->h
.items
)
354 /* calc tree depth */
355 depth
= calc_depth(size
);
356 bucket
->num_nodes
= 1 << depth
;
357 dprintk("size %d depth %d nodes %d\n", size
, depth
, bucket
->num_nodes
);
359 bucket
->node_weights
= malloc(sizeof(__u32
)*bucket
->num_nodes
);
360 if (!bucket
->node_weights
)
363 memset(bucket
->h
.items
, 0, sizeof(__s32
)*bucket
->h
.size
);
364 memset(bucket
->node_weights
, 0, sizeof(__u32
)*bucket
->num_nodes
);
366 for (i
=0; i
<size
; i
++) {
367 bucket
->h
.items
[i
] = items
[i
];
368 node
= crush_calc_tree_node(i
);
369 dprintk("item %d node %d weight %d\n", i
, node
, weights
[i
]);
370 bucket
->node_weights
[node
] = weights
[i
];
372 if (crush_addition_is_unsafe(bucket
->h
.weight
, weights
[i
]))
375 bucket
->h
.weight
+= weights
[i
];
376 for (j
=1; j
<depth
; j
++) {
379 if (crush_addition_is_unsafe(bucket
->node_weights
[node
], weights
[i
]))
382 bucket
->node_weights
[node
] += weights
[i
];
383 dprintk(" node %d weight %d\n", node
, bucket
->node_weights
[node
]);
386 BUG_ON(bucket
->node_weights
[bucket
->num_nodes
/2] != bucket
->h
.weight
);
390 free(bucket
->node_weights
);
391 free(bucket
->h
.items
);
401 * this code was written 8 years ago. i have a vague recollection of
402 * drawing boxes underneath bars of different lengths, where the bar
403 * length represented the probability/weight, and that there was some
404 * trial and error involved in arriving at this implementation.
405 * however, reading the code now after all this time, the intuition
406 * that motivated is lost on me. lame. my only excuse is that I now
407 * know that the approach is fundamentally flawed and am not
408 * particularly motivated to reconstruct the flawed reasoning.
410 * as best as i can remember, the idea is: sort the weights, and start
411 * with the smallest. arbitrarily scale it at 1.0 (16-bit fixed
412 * point). look at the next larger weight, and calculate the scaling
413 * factor for that straw based on the relative difference in weight so
414 * far. what's not clear to me now is why we are looking at wnext
415 * (the delta to the next bigger weight) for all remaining weights,
416 * and slicing things horizontally instead of considering just the
417 * next item or set of items. or why pow() is used the way it is.
419 * note that the original version 1 of this function made special
420 * accomodation for the case where straw lengths were identical. this
421 * is also flawed in a non-obvious way; version 2 drops the special
422 * handling and appears to work just as well.
424 * moral of the story: if you do something clever, write down why it
427 int crush_calc_straw(struct crush_map
*map
, struct crush_bucket_straw
*bucket
)
431 double straw
, wbelow
, lastw
, wnext
, pbelow
;
433 int size
= bucket
->h
.size
;
434 __u32
*weights
= bucket
->item_weights
;
436 /* reverse sort by weight (simple insertion sort) */
437 reverse
= malloc(sizeof(int) * size
);
442 for (i
=1; i
<size
; i
++) {
443 for (j
=0; j
<i
; j
++) {
444 if (weights
[i
] < weights
[reverse
[j
]]) {
447 reverse
[k
] = reverse
[k
-1];
463 if (map
->straw_calc_version
== 0) {
464 /* zero weight items get 0 length straws! */
465 if (weights
[reverse
[i
]] == 0) {
466 bucket
->straws
[reverse
[i
]] = 0;
471 /* set this item's straw */
472 bucket
->straws
[reverse
[i
]] = straw
* 0x10000;
473 dprintk("item %d at %d weight %d straw %d (%lf)\n",
474 bucket
->h
.items
[reverse
[i
]],
475 reverse
[i
], weights
[reverse
[i
]],
476 bucket
->straws
[reverse
[i
]], straw
);
481 /* same weight as previous? */
482 if (weights
[reverse
[i
]] == weights
[reverse
[i
-1]]) {
483 dprintk("same as previous\n");
487 /* adjust straw for next guy */
488 wbelow
+= ((double)weights
[reverse
[i
-1]] - lastw
) *
490 for (j
=i
; j
<size
; j
++)
491 if (weights
[reverse
[j
]] == weights
[reverse
[i
]])
495 wnext
= numleft
* (weights
[reverse
[i
]] -
496 weights
[reverse
[i
-1]]);
497 pbelow
= wbelow
/ (wbelow
+ wnext
);
498 dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n",
499 wbelow
, wnext
, pbelow
, numleft
);
501 straw
*= pow((double)1.0 / pbelow
, (double)1.0 /
504 lastw
= weights
[reverse
[i
-1]];
505 } else if (map
->straw_calc_version
>= 1) {
506 /* zero weight items get 0 length straws! */
507 if (weights
[reverse
[i
]] == 0) {
508 bucket
->straws
[reverse
[i
]] = 0;
514 /* set this item's straw */
515 bucket
->straws
[reverse
[i
]] = straw
* 0x10000;
516 dprintk("item %d at %d weight %d straw %d (%lf)\n",
517 bucket
->h
.items
[reverse
[i
]],
518 reverse
[i
], weights
[reverse
[i
]],
519 bucket
->straws
[reverse
[i
]], straw
);
524 /* adjust straw for next guy */
525 wbelow
+= ((double)weights
[reverse
[i
-1]] - lastw
) *
528 wnext
= numleft
* (weights
[reverse
[i
]] -
529 weights
[reverse
[i
-1]]);
530 pbelow
= wbelow
/ (wbelow
+ wnext
);
531 dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n",
532 wbelow
, wnext
, pbelow
, numleft
);
534 straw
*= pow((double)1.0 / pbelow
, (double)1.0 /
537 lastw
= weights
[reverse
[i
-1]];
545 struct crush_bucket_straw
*
546 crush_make_straw_bucket(struct crush_map
*map
,
553 struct crush_bucket_straw
*bucket
;
556 bucket
= malloc(sizeof(*bucket
));
559 memset(bucket
, 0, sizeof(*bucket
));
560 bucket
->h
.alg
= CRUSH_BUCKET_STRAW
;
561 bucket
->h
.hash
= hash
;
562 bucket
->h
.type
= type
;
563 bucket
->h
.size
= size
;
565 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
566 if (!bucket
->h
.items
)
568 bucket
->item_weights
= malloc(sizeof(__u32
)*size
);
569 if (!bucket
->item_weights
)
571 bucket
->straws
= malloc(sizeof(__u32
)*size
);
575 bucket
->h
.weight
= 0;
576 for (i
=0; i
<size
; i
++) {
577 bucket
->h
.items
[i
] = items
[i
];
578 bucket
->h
.weight
+= weights
[i
];
579 bucket
->item_weights
[i
] = weights
[i
];
582 if (crush_calc_straw(map
, bucket
) < 0)
587 free(bucket
->straws
);
588 free(bucket
->item_weights
);
589 free(bucket
->h
.items
);
594 struct crush_bucket_straw2
*
595 crush_make_straw2_bucket(struct crush_map
*map
,
602 struct crush_bucket_straw2
*bucket
;
605 bucket
= malloc(sizeof(*bucket
));
608 memset(bucket
, 0, sizeof(*bucket
));
609 bucket
->h
.alg
= CRUSH_BUCKET_STRAW2
;
610 bucket
->h
.hash
= hash
;
611 bucket
->h
.type
= type
;
612 bucket
->h
.size
= size
;
614 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
615 if (!bucket
->h
.items
)
617 bucket
->item_weights
= malloc(sizeof(__u32
)*size
);
618 if (!bucket
->item_weights
)
621 bucket
->h
.weight
= 0;
622 for (i
=0; i
<size
; i
++) {
623 bucket
->h
.items
[i
] = items
[i
];
624 bucket
->h
.weight
+= weights
[i
];
625 bucket
->item_weights
[i
] = weights
[i
];
630 free(bucket
->item_weights
);
631 free(bucket
->h
.items
);
639 crush_make_bucket(struct crush_map
*map
,
640 int alg
, int hash
, int type
, int size
,
647 case CRUSH_BUCKET_UNIFORM
:
649 item_weight
= weights
[0];
652 return (struct crush_bucket
*)crush_make_uniform_bucket(hash
, type
, size
, items
, item_weight
);
654 case CRUSH_BUCKET_LIST
:
655 return (struct crush_bucket
*)crush_make_list_bucket(hash
, type
, size
, items
, weights
);
657 case CRUSH_BUCKET_TREE
:
658 return (struct crush_bucket
*)crush_make_tree_bucket(hash
, type
, size
, items
, weights
);
660 case CRUSH_BUCKET_STRAW
:
661 return (struct crush_bucket
*)crush_make_straw_bucket(map
, hash
, type
, size
, items
, weights
);
662 case CRUSH_BUCKET_STRAW2
:
663 return (struct crush_bucket
*)crush_make_straw2_bucket(map
, hash
, type
, size
, items
, weights
);
669 /************************************************/
671 int crush_add_uniform_bucket_item(struct crush_bucket_uniform
*bucket
, int item
, int weight
)
673 int newsize
= bucket
->h
.size
+ 1;
674 void *_realloc
= NULL
;
676 /* In such situation 'CRUSH_BUCKET_UNIFORM', the weight
677 provided for the item should be the same as
678 bucket->item_weight defined with 'crush_make_bucket'. This
679 assumption is enforced by the return value which is always
681 if (bucket
->item_weight
!= weight
) {
685 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
688 bucket
->h
.items
= _realloc
;
691 bucket
->h
.items
[newsize
-1] = item
;
693 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
696 bucket
->h
.weight
+= weight
;
702 int crush_add_list_bucket_item(struct crush_bucket_list
*bucket
, int item
, int weight
)
704 int newsize
= bucket
->h
.size
+ 1;
705 void *_realloc
= NULL
;
707 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
710 bucket
->h
.items
= _realloc
;
712 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
715 bucket
->item_weights
= _realloc
;
717 if ((_realloc
= realloc(bucket
->sum_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
720 bucket
->sum_weights
= _realloc
;
723 bucket
->h
.items
[newsize
-1] = item
;
724 bucket
->item_weights
[newsize
-1] = weight
;
727 if (crush_addition_is_unsafe(bucket
->sum_weights
[newsize
-2], weight
))
730 bucket
->sum_weights
[newsize
-1] = bucket
->sum_weights
[newsize
-2] + weight
;
734 bucket
->sum_weights
[newsize
-1] = weight
;
737 bucket
->h
.weight
+= weight
;
742 int crush_add_tree_bucket_item(struct crush_bucket_tree
*bucket
, int item
, int weight
)
744 int newsize
= bucket
->h
.size
+ 1;
745 int depth
= calc_depth(newsize
);;
748 void *_realloc
= NULL
;
750 bucket
->num_nodes
= 1 << depth
;
752 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
755 bucket
->h
.items
= _realloc
;
757 if ((_realloc
= realloc(bucket
->node_weights
, sizeof(__u32
)*bucket
->num_nodes
)) == NULL
) {
760 bucket
->node_weights
= _realloc
;
763 node
= crush_calc_tree_node(newsize
-1);
764 bucket
->node_weights
[node
] = weight
;
766 /* if the depth increase, we need to initialize the new root node's weight before add bucket item */
767 int root
= bucket
->num_nodes
/2;
768 if (depth
>= 2 && (node
- 1) == root
) {
769 /* if the new item is the first node in right sub tree, so
770 * the root node initial weight is left sub tree's weight
772 bucket
->node_weights
[root
] = bucket
->node_weights
[root
/2];
775 for (j
=1; j
<depth
; j
++) {
778 if (crush_addition_is_unsafe(bucket
->node_weights
[node
], weight
))
781 bucket
->node_weights
[node
] += weight
;
782 dprintk(" node %d weight %d\n", node
, bucket
->node_weights
[node
]);
786 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
789 bucket
->h
.items
[newsize
-1] = item
;
790 bucket
->h
.weight
+= weight
;
796 int crush_add_straw_bucket_item(struct crush_map
*map
,
797 struct crush_bucket_straw
*bucket
,
798 int item
, int weight
)
800 int newsize
= bucket
->h
.size
+ 1;
802 void *_realloc
= NULL
;
804 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
807 bucket
->h
.items
= _realloc
;
809 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
812 bucket
->item_weights
= _realloc
;
814 if ((_realloc
= realloc(bucket
->straws
, sizeof(__u32
)*newsize
)) == NULL
) {
817 bucket
->straws
= _realloc
;
820 bucket
->h
.items
[newsize
-1] = item
;
821 bucket
->item_weights
[newsize
-1] = weight
;
823 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
826 bucket
->h
.weight
+= weight
;
829 return crush_calc_straw(map
, bucket
);
832 int crush_add_straw2_bucket_item(struct crush_map
*map
,
833 struct crush_bucket_straw2
*bucket
,
834 int item
, int weight
)
836 int newsize
= bucket
->h
.size
+ 1;
838 void *_realloc
= NULL
;
840 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
843 bucket
->h
.items
= _realloc
;
845 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
848 bucket
->item_weights
= _realloc
;
851 bucket
->h
.items
[newsize
-1] = item
;
852 bucket
->item_weights
[newsize
-1] = weight
;
854 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
857 bucket
->h
.weight
+= weight
;
863 int crush_bucket_add_item(struct crush_map
*map
,
864 struct crush_bucket
*b
, int item
, int weight
)
867 case CRUSH_BUCKET_UNIFORM
:
868 return crush_add_uniform_bucket_item((struct crush_bucket_uniform
*)b
, item
, weight
);
869 case CRUSH_BUCKET_LIST
:
870 return crush_add_list_bucket_item((struct crush_bucket_list
*)b
, item
, weight
);
871 case CRUSH_BUCKET_TREE
:
872 return crush_add_tree_bucket_item((struct crush_bucket_tree
*)b
, item
, weight
);
873 case CRUSH_BUCKET_STRAW
:
874 return crush_add_straw_bucket_item(map
, (struct crush_bucket_straw
*)b
, item
, weight
);
875 case CRUSH_BUCKET_STRAW2
:
876 return crush_add_straw2_bucket_item(map
, (struct crush_bucket_straw2
*)b
, item
, weight
);
882 /************************************************/
884 int crush_remove_uniform_bucket_item(struct crush_bucket_uniform
*bucket
, int item
)
888 void *_realloc
= NULL
;
890 for (i
= 0; i
< bucket
->h
.size
; i
++)
891 if (bucket
->h
.items
[i
] == item
)
893 if (i
== bucket
->h
.size
)
896 for (j
= i
; j
< bucket
->h
.size
; j
++)
897 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
898 newsize
= --bucket
->h
.size
;
899 if (bucket
->item_weight
< bucket
->h
.weight
)
900 bucket
->h
.weight
-= bucket
->item_weight
;
902 bucket
->h
.weight
= 0;
904 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
907 bucket
->h
.items
= _realloc
;
912 int crush_remove_list_bucket_item(struct crush_bucket_list
*bucket
, int item
)
918 for (i
= 0; i
< bucket
->h
.size
; i
++)
919 if (bucket
->h
.items
[i
] == item
)
921 if (i
== bucket
->h
.size
)
924 weight
= bucket
->item_weights
[i
];
925 for (j
= i
; j
< bucket
->h
.size
; j
++) {
926 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
927 bucket
->item_weights
[j
] = bucket
->item_weights
[j
+1];
928 bucket
->sum_weights
[j
] = bucket
->sum_weights
[j
+1] - weight
;
930 if (weight
< bucket
->h
.weight
)
931 bucket
->h
.weight
-= weight
;
933 bucket
->h
.weight
= 0;
934 newsize
= --bucket
->h
.size
;
936 void *_realloc
= NULL
;
938 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
941 bucket
->h
.items
= _realloc
;
943 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
946 bucket
->item_weights
= _realloc
;
948 if ((_realloc
= realloc(bucket
->sum_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
951 bucket
->sum_weights
= _realloc
;
956 int crush_remove_tree_bucket_item(struct crush_bucket_tree
*bucket
, int item
)
961 for (i
= 0; i
< bucket
->h
.size
; i
++) {
965 int depth
= calc_depth(bucket
->h
.size
);
967 if (bucket
->h
.items
[i
] != item
)
970 bucket
->h
.items
[i
] = 0;
971 node
= crush_calc_tree_node(i
);
972 weight
= bucket
->node_weights
[node
];
973 bucket
->node_weights
[node
] = 0;
975 for (j
= 1; j
< depth
; j
++) {
977 bucket
->node_weights
[node
] -= weight
;
978 dprintk(" node %d weight %d\n", node
, bucket
->node_weights
[node
]);
980 if (weight
< bucket
->h
.weight
)
981 bucket
->h
.weight
-= weight
;
983 bucket
->h
.weight
= 0;
986 if (i
== bucket
->h
.size
)
989 newsize
= bucket
->h
.size
;
990 while (newsize
> 0) {
991 int node
= crush_calc_tree_node(newsize
- 1);
992 if (bucket
->node_weights
[node
])
997 if (newsize
!= bucket
->h
.size
) {
998 int olddepth
, newdepth
;
1000 void *_realloc
= NULL
;
1002 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
1005 bucket
->h
.items
= _realloc
;
1008 olddepth
= calc_depth(bucket
->h
.size
);
1009 newdepth
= calc_depth(newsize
);
1010 if (olddepth
!= newdepth
) {
1011 bucket
->num_nodes
= 1 << newdepth
;
1012 if ((_realloc
= realloc(bucket
->node_weights
,
1013 sizeof(__u32
)*bucket
->num_nodes
)) == NULL
) {
1016 bucket
->node_weights
= _realloc
;
1020 bucket
->h
.size
= newsize
;
1025 int crush_remove_straw_bucket_item(struct crush_map
*map
,
1026 struct crush_bucket_straw
*bucket
, int item
)
1028 int newsize
= bucket
->h
.size
- 1;
1031 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1032 if (bucket
->h
.items
[i
] == item
) {
1034 if (bucket
->item_weights
[i
] < bucket
->h
.weight
)
1035 bucket
->h
.weight
-= bucket
->item_weights
[i
];
1037 bucket
->h
.weight
= 0;
1038 for (j
= i
; j
< bucket
->h
.size
; j
++) {
1039 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
1040 bucket
->item_weights
[j
] = bucket
->item_weights
[j
+1];
1045 if (i
== bucket
->h
.size
)
1048 void *_realloc
= NULL
;
1050 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
1053 bucket
->h
.items
= _realloc
;
1055 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
1058 bucket
->item_weights
= _realloc
;
1060 if ((_realloc
= realloc(bucket
->straws
, sizeof(__u32
)*newsize
)) == NULL
) {
1063 bucket
->straws
= _realloc
;
1066 return crush_calc_straw(map
, bucket
);
1069 int crush_remove_straw2_bucket_item(struct crush_map
*map
,
1070 struct crush_bucket_straw2
*bucket
, int item
)
1072 int newsize
= bucket
->h
.size
- 1;
1075 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1076 if (bucket
->h
.items
[i
] == item
) {
1078 if (bucket
->item_weights
[i
] < bucket
->h
.weight
)
1079 bucket
->h
.weight
-= bucket
->item_weights
[i
];
1081 bucket
->h
.weight
= 0;
1082 for (j
= i
; j
< bucket
->h
.size
; j
++) {
1083 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
1084 bucket
->item_weights
[j
] = bucket
->item_weights
[j
+1];
1089 if (i
== bucket
->h
.size
)
1092 void *_realloc
= NULL
;
1094 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
1097 bucket
->h
.items
= _realloc
;
1099 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
1102 bucket
->item_weights
= _realloc
;
1108 int crush_bucket_remove_item(struct crush_map
*map
, struct crush_bucket
*b
, int item
)
1111 case CRUSH_BUCKET_UNIFORM
:
1112 return crush_remove_uniform_bucket_item((struct crush_bucket_uniform
*)b
, item
);
1113 case CRUSH_BUCKET_LIST
:
1114 return crush_remove_list_bucket_item((struct crush_bucket_list
*)b
, item
);
1115 case CRUSH_BUCKET_TREE
:
1116 return crush_remove_tree_bucket_item((struct crush_bucket_tree
*)b
, item
);
1117 case CRUSH_BUCKET_STRAW
:
1118 return crush_remove_straw_bucket_item(map
, (struct crush_bucket_straw
*)b
, item
);
1119 case CRUSH_BUCKET_STRAW2
:
1120 return crush_remove_straw2_bucket_item(map
, (struct crush_bucket_straw2
*)b
, item
);
1127 /************************************************/
1129 int crush_adjust_uniform_bucket_item_weight(struct crush_bucket_uniform
*bucket
, int item
, int weight
)
1131 int diff
= (weight
- bucket
->item_weight
) * bucket
->h
.size
;
1133 bucket
->item_weight
= weight
;
1134 bucket
->h
.weight
= bucket
->item_weight
* bucket
->h
.size
;
1139 int crush_adjust_list_bucket_item_weight(struct crush_bucket_list
*bucket
, int item
, int weight
)
1144 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1145 if (bucket
->h
.items
[i
] == item
)
1148 if (i
== bucket
->h
.size
)
1151 diff
= weight
- bucket
->item_weights
[i
];
1152 bucket
->item_weights
[i
] = weight
;
1153 bucket
->h
.weight
+= diff
;
1155 for (j
= i
; j
< bucket
->h
.size
; j
++)
1156 bucket
->sum_weights
[j
] += diff
;
1161 int crush_adjust_tree_bucket_item_weight(struct crush_bucket_tree
*bucket
, int item
, int weight
)
1166 unsigned depth
= calc_depth(bucket
->h
.size
);
1168 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1169 if (bucket
->h
.items
[i
] == item
)
1172 if (i
== bucket
->h
.size
)
1175 node
= crush_calc_tree_node(i
);
1176 diff
= weight
- bucket
->node_weights
[node
];
1177 bucket
->node_weights
[node
] = weight
;
1178 bucket
->h
.weight
+= diff
;
1180 for (j
=1; j
<depth
; j
++) {
1181 node
= parent(node
);
1182 bucket
->node_weights
[node
] += diff
;
1188 int crush_adjust_straw_bucket_item_weight(struct crush_map
*map
,
1189 struct crush_bucket_straw
*bucket
,
1190 int item
, int weight
)
1196 for (idx
= 0; idx
< bucket
->h
.size
; idx
++)
1197 if (bucket
->h
.items
[idx
] == item
)
1199 if (idx
== bucket
->h
.size
)
1202 diff
= weight
- bucket
->item_weights
[idx
];
1203 bucket
->item_weights
[idx
] = weight
;
1204 bucket
->h
.weight
+= diff
;
1206 r
= crush_calc_straw(map
, bucket
);
1213 int crush_adjust_straw2_bucket_item_weight(struct crush_map
*map
,
1214 struct crush_bucket_straw2
*bucket
,
1215 int item
, int weight
)
1220 for (idx
= 0; idx
< bucket
->h
.size
; idx
++)
1221 if (bucket
->h
.items
[idx
] == item
)
1223 if (idx
== bucket
->h
.size
)
1226 diff
= weight
- bucket
->item_weights
[idx
];
1227 bucket
->item_weights
[idx
] = weight
;
1228 bucket
->h
.weight
+= diff
;
1233 int crush_bucket_adjust_item_weight(struct crush_map
*map
,
1234 struct crush_bucket
*b
,
1235 int item
, int weight
)
1238 case CRUSH_BUCKET_UNIFORM
:
1239 return crush_adjust_uniform_bucket_item_weight((struct crush_bucket_uniform
*)b
,
1241 case CRUSH_BUCKET_LIST
:
1242 return crush_adjust_list_bucket_item_weight((struct crush_bucket_list
*)b
,
1244 case CRUSH_BUCKET_TREE
:
1245 return crush_adjust_tree_bucket_item_weight((struct crush_bucket_tree
*)b
,
1247 case CRUSH_BUCKET_STRAW
:
1248 return crush_adjust_straw_bucket_item_weight(map
,
1249 (struct crush_bucket_straw
*)b
,
1251 case CRUSH_BUCKET_STRAW2
:
1252 return crush_adjust_straw2_bucket_item_weight(map
,
1253 (struct crush_bucket_straw2
*)b
,
1260 /************************************************/
1262 static int crush_reweight_uniform_bucket(struct crush_map
*map
, struct crush_bucket_uniform
*bucket
)
1265 unsigned sum
= 0, n
= 0, leaves
= 0;
1267 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1268 int id
= bucket
->h
.items
[i
];
1270 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1271 crush_reweight_bucket(map
, c
);
1273 if (crush_addition_is_unsafe(sum
, c
->weight
))
1284 bucket
->item_weight
= sum
/ n
; // more bucket children than leaves, average!
1285 bucket
->h
.weight
= bucket
->item_weight
* bucket
->h
.size
;
1290 static int crush_reweight_list_bucket(struct crush_map
*map
, struct crush_bucket_list
*bucket
)
1294 bucket
->h
.weight
= 0;
1295 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1296 int id
= bucket
->h
.items
[i
];
1298 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1299 crush_reweight_bucket(map
, c
);
1300 bucket
->item_weights
[i
] = c
->weight
;
1303 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->item_weights
[i
]))
1306 bucket
->h
.weight
+= bucket
->item_weights
[i
];
1312 static int crush_reweight_tree_bucket(struct crush_map
*map
, struct crush_bucket_tree
*bucket
)
1316 bucket
->h
.weight
= 0;
1317 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1318 int node
= crush_calc_tree_node(i
);
1319 int id
= bucket
->h
.items
[i
];
1321 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1322 crush_reweight_bucket(map
, c
);
1323 bucket
->node_weights
[node
] = c
->weight
;
1326 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->node_weights
[node
]))
1329 bucket
->h
.weight
+= bucket
->node_weights
[node
];
1337 static int crush_reweight_straw_bucket(struct crush_map
*map
, struct crush_bucket_straw
*bucket
)
1341 bucket
->h
.weight
= 0;
1342 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1343 int id
= bucket
->h
.items
[i
];
1345 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1346 crush_reweight_bucket(map
, c
);
1347 bucket
->item_weights
[i
] = c
->weight
;
1350 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->item_weights
[i
]))
1353 bucket
->h
.weight
+= bucket
->item_weights
[i
];
1355 crush_calc_straw(map
, bucket
);
1360 static int crush_reweight_straw2_bucket(struct crush_map
*map
, struct crush_bucket_straw2
*bucket
)
1364 bucket
->h
.weight
= 0;
1365 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1366 int id
= bucket
->h
.items
[i
];
1368 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1369 crush_reweight_bucket(map
, c
);
1370 bucket
->item_weights
[i
] = c
->weight
;
1373 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->item_weights
[i
]))
1376 bucket
->h
.weight
+= bucket
->item_weights
[i
];
1382 int crush_reweight_bucket(struct crush_map
*map
, struct crush_bucket
*b
)
1385 case CRUSH_BUCKET_UNIFORM
:
1386 return crush_reweight_uniform_bucket(map
, (struct crush_bucket_uniform
*)b
);
1387 case CRUSH_BUCKET_LIST
:
1388 return crush_reweight_list_bucket(map
, (struct crush_bucket_list
*)b
);
1389 case CRUSH_BUCKET_TREE
:
1390 return crush_reweight_tree_bucket(map
, (struct crush_bucket_tree
*)b
);
1391 case CRUSH_BUCKET_STRAW
:
1392 return crush_reweight_straw_bucket(map
, (struct crush_bucket_straw
*)b
);
1393 case CRUSH_BUCKET_STRAW2
:
1394 return crush_reweight_straw2_bucket(map
, (struct crush_bucket_straw2
*)b
);
1400 struct crush_choose_arg
*crush_make_choose_args(struct crush_map
*map
, int num_positions
)
1403 int sum_bucket_size
= 0;
1404 int bucket_count
= 0;
1405 for (b
= 0; b
< map
->max_buckets
; b
++) {
1406 if (map
->buckets
[b
] == 0)
1408 sum_bucket_size
+= map
->buckets
[b
]->size
;
1411 dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n",
1412 sum_bucket_size
, map
->max_buckets
, bucket_count
);
1413 int size
= (sizeof(struct crush_choose_arg
) * map
->max_buckets
+
1414 sizeof(struct crush_weight_set
) * bucket_count
* num_positions
+
1415 sizeof(__u32
) * sum_bucket_size
* num_positions
+ // weights
1416 sizeof(__s32
) * sum_bucket_size
); // ids
1417 char *space
= malloc(size
);
1418 struct crush_choose_arg
*arg
= (struct crush_choose_arg
*)space
;
1419 struct crush_weight_set
*weight_set
= (struct crush_weight_set
*)(arg
+ map
->max_buckets
);
1420 __u32
*weights
= (__u32
*)(weight_set
+ bucket_count
* num_positions
);
1421 char *weight_set_ends
= (char*)weights
;
1422 __s32
*ids
= (__s32
*)(weights
+ sum_bucket_size
* num_positions
);
1423 char *weights_end
= (char *)ids
;
1424 char *ids_end
= (char *)(ids
+ sum_bucket_size
);
1425 BUG_ON(space
+ size
!= ids_end
);
1426 for (b
= 0; b
< map
->max_buckets
; b
++) {
1427 if (map
->buckets
[b
] == 0) {
1428 memset(&arg
[b
], '\0', sizeof(struct crush_choose_arg
));
1431 struct crush_bucket_straw2
*bucket
= (struct crush_bucket_straw2
*)map
->buckets
[b
];
1434 for (position
= 0; position
< num_positions
; position
++) {
1435 memcpy(weights
, bucket
->item_weights
, sizeof(__u32
) * bucket
->h
.size
);
1436 weight_set
[position
].weights
= weights
;
1437 weight_set
[position
].size
= bucket
->h
.size
;
1438 dprintk("moving weight %d bytes forward\n", (int)((weights
+ bucket
->h
.size
) - weights
));
1439 weights
+= bucket
->h
.size
;
1441 arg
[b
].weight_set
= weight_set
;
1442 arg
[b
].weight_set_size
= num_positions
;
1443 weight_set
+= position
;
1445 memcpy(ids
, bucket
->h
.items
, sizeof(__s32
) * bucket
->h
.size
);
1447 arg
[b
].ids_size
= bucket
->h
.size
;
1448 ids
+= bucket
->h
.size
;
1450 BUG_ON((char*)weight_set_ends
!= (char*)weight_set
);
1451 BUG_ON((char*)weights_end
!= (char*)weights
);
1452 BUG_ON((char*)ids
!= (char*)ids_end
);
1456 void crush_destroy_choose_args(struct crush_choose_arg
*args
)
1461 /***************************/
1463 /* methods to check for safe arithmetic operations */
1465 int crush_addition_is_unsafe(__u32 a
, __u32 b
)
1467 if ((((__u32
)(-1)) - b
) < a
)
1473 int crush_multiplication_is_unsafe(__u32 a
, __u32 b
)
1475 /* prevent division by zero */
1480 if ((((__u32
)(-1)) / b
) < a
)
1486 /***************************/
1488 /* methods to configure crush_map */
1490 void set_legacy_crush_map(struct crush_map
*map
) {
1491 /* initialize legacy tunable values */
1492 map
->choose_local_tries
= 2;
1493 map
->choose_local_fallback_tries
= 5;
1494 map
->choose_total_tries
= 19;
1495 map
->chooseleaf_descend_once
= 0;
1496 map
->chooseleaf_vary_r
= 0;
1497 map
->chooseleaf_stable
= 0;
1498 map
->straw_calc_version
= 0;
1500 // by default, use legacy types, and also exclude tree,
1501 // since it was buggy.
1502 map
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
1505 void set_optimal_crush_map(struct crush_map
*map
) {
1506 map
->choose_local_tries
= 0;
1507 map
->choose_local_fallback_tries
= 0;
1508 map
->choose_total_tries
= 50;
1509 map
->chooseleaf_descend_once
= 1;
1510 map
->chooseleaf_vary_r
= 1;
1511 map
->chooseleaf_stable
= 1;
1512 map
->allowed_bucket_algs
= (
1513 (1 << CRUSH_BUCKET_UNIFORM
) |
1514 (1 << CRUSH_BUCKET_LIST
) |
1515 (1 << CRUSH_BUCKET_STRAW
) |
1516 (1 << CRUSH_BUCKET_STRAW2
));