8 #include "crush/crush.h"
11 #define dprintk(args...) /* printf(args) */
13 #define BUG_ON(x) assert(!(x))
15 struct crush_map
*crush_create()
18 m
= malloc(sizeof(*m
));
21 memset(m
, 0, sizeof(*m
));
23 set_optimal_crush_map(m
);
28 * finalize should be called _after_ all buckets are added to the map.
30 void crush_finalize(struct crush_map
*map
)
35 /* Calculate the needed working space while we do other
36 finalization tasks. */
37 map
->working_size
= sizeof(struct crush_work
);
38 /* Space for the array of pointers to per-bucket workspace */
39 map
->working_size
+= map
->max_buckets
*
40 sizeof(struct crush_work_bucket
*);
42 /* calc max_devices */
44 for (b
=0; b
<map
->max_buckets
; b
++) {
45 if (map
->buckets
[b
] == 0)
47 for (i
=0; i
<map
->buckets
[b
]->size
; i
++)
48 if (map
->buckets
[b
]->items
[i
] >= map
->max_devices
)
49 map
->max_devices
= map
->buckets
[b
]->items
[i
] + 1;
51 switch (map
->buckets
[b
]->alg
) {
53 /* The base case, permutation variables and
54 the pointer to the permutation array. */
55 map
->working_size
+= sizeof(struct crush_work_bucket
);
58 /* Every bucket has a permutation array. */
59 map
->working_size
+= map
->buckets
[b
]->size
* sizeof(__u32
);
67 int crush_add_rule(struct crush_map
*map
, struct crush_rule
*rule
, int ruleno
)
72 for (r
=0; r
< map
->max_rules
; r
++)
73 if (map
->rules
[r
] == 0)
75 assert(r
< CRUSH_MAX_RULES
);
80 if (r
>= map
->max_rules
) {
83 void *_realloc
= NULL
;
84 if (map
->max_rules
+1 > CRUSH_MAX_RULES
)
86 oldsize
= map
->max_rules
;
88 if ((_realloc
= realloc(map
->rules
, map
->max_rules
* sizeof(map
->rules
[0]))) == NULL
) {
91 map
->rules
= _realloc
;
93 memset(map
->rules
+ oldsize
, 0, (map
->max_rules
-oldsize
) * sizeof(map
->rules
[0]));
101 struct crush_rule
*crush_make_rule(int len
, int ruleset
, int type
, int minsize
, int maxsize
)
103 struct crush_rule
*rule
;
104 rule
= malloc(crush_rule_size(len
));
108 rule
->mask
.ruleset
= ruleset
;
109 rule
->mask
.type
= type
;
110 rule
->mask
.min_size
= minsize
;
111 rule
->mask
.max_size
= maxsize
;
116 * be careful; this doesn't verify that the buffer you allocated is big enough!
118 void crush_rule_set_step(struct crush_rule
*rule
, int n
, int op
, int arg1
, int arg2
)
120 assert((__u32
)n
< rule
->len
);
121 rule
->steps
[n
].op
= op
;
122 rule
->steps
[n
].arg1
= arg1
;
123 rule
->steps
[n
].arg2
= arg2
;
128 int crush_get_next_bucket_id(struct crush_map
*map
)
131 for (pos
=0; pos
< map
->max_buckets
; pos
++)
132 if (map
->buckets
[pos
] == 0)
138 int crush_add_bucket(struct crush_map
*map
,
140 struct crush_bucket
*bucket
,
145 /* find a bucket id */
147 id
= crush_get_next_bucket_id(map
);
150 while (pos
>= map
->max_buckets
) {
152 int oldsize
= map
->max_buckets
;
153 if (map
->max_buckets
)
154 map
->max_buckets
*= 2;
156 map
->max_buckets
= 8;
157 void *_realloc
= NULL
;
158 if ((_realloc
= realloc(map
->buckets
, map
->max_buckets
* sizeof(map
->buckets
[0]))) == NULL
) {
161 map
->buckets
= _realloc
;
163 memset(map
->buckets
+ oldsize
, 0, (map
->max_buckets
-oldsize
) * sizeof(map
->buckets
[0]));
166 if (map
->buckets
[pos
] != 0) {
172 map
->buckets
[pos
] = bucket
;
174 if (idout
) *idout
= id
;
178 int crush_remove_bucket(struct crush_map
*map
, struct crush_bucket
*bucket
)
180 int pos
= -1 - bucket
->id
;
181 assert(pos
< map
->max_buckets
);
182 map
->buckets
[pos
] = NULL
;
183 crush_destroy_bucket(bucket
);
190 struct crush_bucket_uniform
*
191 crush_make_uniform_bucket(int hash
, int type
, int size
,
196 struct crush_bucket_uniform
*bucket
;
198 bucket
= malloc(sizeof(*bucket
));
201 memset(bucket
, 0, sizeof(*bucket
));
202 bucket
->h
.alg
= CRUSH_BUCKET_UNIFORM
;
203 bucket
->h
.hash
= hash
;
204 bucket
->h
.type
= type
;
205 bucket
->h
.size
= size
;
207 if (crush_multiplication_is_unsafe(size
, item_weight
))
210 bucket
->h
.weight
= size
* item_weight
;
211 bucket
->item_weight
= item_weight
;
212 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
214 if (!bucket
->h
.items
)
217 for (i
=0; i
<size
; i
++)
218 bucket
->h
.items
[i
] = items
[i
];
222 free(bucket
->h
.items
);
230 struct crush_bucket_list
*
231 crush_make_list_bucket(int hash
, int type
, int size
,
237 struct crush_bucket_list
*bucket
;
239 bucket
= malloc(sizeof(*bucket
));
242 memset(bucket
, 0, sizeof(*bucket
));
243 bucket
->h
.alg
= CRUSH_BUCKET_LIST
;
244 bucket
->h
.hash
= hash
;
245 bucket
->h
.type
= type
;
246 bucket
->h
.size
= size
;
248 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
249 if (!bucket
->h
.items
)
253 bucket
->item_weights
= malloc(sizeof(__u32
)*size
);
254 if (!bucket
->item_weights
)
256 bucket
->sum_weights
= malloc(sizeof(__u32
)*size
);
257 if (!bucket
->sum_weights
)
260 for (i
=0; i
<size
; i
++) {
261 bucket
->h
.items
[i
] = items
[i
];
262 bucket
->item_weights
[i
] = weights
[i
];
264 if (crush_addition_is_unsafe(w
, weights
[i
]))
268 bucket
->sum_weights
[i
] = w
;
269 /*dprintk("pos %d item %d weight %d sum %d\n",
270 i, items[i], weights[i], bucket->sum_weights[i]);*/
273 bucket
->h
.weight
= w
;
277 free(bucket
->sum_weights
);
278 free(bucket
->item_weights
);
279 free(bucket
->h
.items
);
287 static int height(int n
) {
289 while ((n
& 1) == 0) {
295 static int on_right(int n
, int h
) {
296 return n
& (1 << (h
+1));
298 static int parent(int n
)
307 static int calc_depth(int size
)
322 struct crush_bucket_tree
*
323 crush_make_tree_bucket(int hash
, int type
, int size
,
324 int *items
, /* in leaf order */
327 struct crush_bucket_tree
*bucket
;
332 bucket
= malloc(sizeof(*bucket
));
335 memset(bucket
, 0, sizeof(*bucket
));
336 bucket
->h
.alg
= CRUSH_BUCKET_TREE
;
337 bucket
->h
.hash
= hash
;
338 bucket
->h
.type
= type
;
339 bucket
->h
.size
= size
;
342 bucket
->h
.items
= NULL
;
343 bucket
->h
.weight
= 0;
344 bucket
->node_weights
= NULL
;
345 bucket
->num_nodes
= 0;
346 /* printf("size 0 depth 0 nodes 0\n"); */
350 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
351 if (!bucket
->h
.items
)
354 /* calc tree depth */
355 depth
= calc_depth(size
);
356 bucket
->num_nodes
= 1 << depth
;
357 dprintk("size %d depth %d nodes %d\n", size
, depth
, bucket
->num_nodes
);
359 bucket
->node_weights
= malloc(sizeof(__u32
)*bucket
->num_nodes
);
360 if (!bucket
->node_weights
)
363 memset(bucket
->h
.items
, 0, sizeof(__s32
)*bucket
->h
.size
);
364 memset(bucket
->node_weights
, 0, sizeof(__u32
)*bucket
->num_nodes
);
366 for (i
=0; i
<size
; i
++) {
367 bucket
->h
.items
[i
] = items
[i
];
368 node
= crush_calc_tree_node(i
);
369 dprintk("item %d node %d weight %d\n", i
, node
, weights
[i
]);
370 bucket
->node_weights
[node
] = weights
[i
];
372 if (crush_addition_is_unsafe(bucket
->h
.weight
, weights
[i
]))
375 bucket
->h
.weight
+= weights
[i
];
376 for (j
=1; j
<depth
; j
++) {
379 if (crush_addition_is_unsafe(bucket
->node_weights
[node
], weights
[i
]))
382 bucket
->node_weights
[node
] += weights
[i
];
383 dprintk(" node %d weight %d\n", node
, bucket
->node_weights
[node
]);
386 BUG_ON(bucket
->node_weights
[bucket
->num_nodes
/2] != bucket
->h
.weight
);
390 free(bucket
->node_weights
);
391 free(bucket
->h
.items
);
401 * this code was written 8 years ago. i have a vague recollection of
402 * drawing boxes underneath bars of different lengths, where the bar
403 * length represented the probability/weight, and that there was some
404 * trial and error involved in arriving at this implementation.
405 * however, reading the code now after all this time, the intuition
406 * that motivated is lost on me. lame. my only excuse is that I now
407 * know that the approach is fundamentally flawed and am not
408 * particularly motivated to reconstruct the flawed reasoning.
410 * as best as i can remember, the idea is: sort the weights, and start
411 * with the smallest. arbitrarily scale it at 1.0 (16-bit fixed
412 * point). look at the next larger weight, and calculate the scaling
413 * factor for that straw based on the relative difference in weight so
414 * far. what's not clear to me now is why we are looking at wnext
415 * (the delta to the next bigger weight) for all remaining weights,
416 * and slicing things horizontally instead of considering just the
417 * next item or set of items. or why pow() is used the way it is.
419 * note that the original version 1 of this function made special
420 * accomodation for the case where straw lengths were identical. this
421 * is also flawed in a non-obvious way; version 2 drops the special
422 * handling and appears to work just as well.
424 * moral of the story: if you do something clever, write down why it
427 int crush_calc_straw(struct crush_map
*map
, struct crush_bucket_straw
*bucket
)
431 double straw
, wbelow
, lastw
, wnext
, pbelow
;
433 int size
= bucket
->h
.size
;
434 __u32
*weights
= bucket
->item_weights
;
436 /* reverse sort by weight (simple insertion sort) */
437 reverse
= malloc(sizeof(int) * size
);
442 for (i
=1; i
<size
; i
++) {
443 for (j
=0; j
<i
; j
++) {
444 if (weights
[i
] < weights
[reverse
[j
]]) {
447 reverse
[k
] = reverse
[k
-1];
463 if (map
->straw_calc_version
== 0) {
464 /* zero weight items get 0 length straws! */
465 if (weights
[reverse
[i
]] == 0) {
466 bucket
->straws
[reverse
[i
]] = 0;
471 /* set this item's straw */
472 bucket
->straws
[reverse
[i
]] = straw
* 0x10000;
473 dprintk("item %d at %d weight %d straw %d (%lf)\n",
474 bucket
->h
.items
[reverse
[i
]],
475 reverse
[i
], weights
[reverse
[i
]],
476 bucket
->straws
[reverse
[i
]], straw
);
481 /* same weight as previous? */
482 if (weights
[reverse
[i
]] == weights
[reverse
[i
-1]]) {
483 dprintk("same as previous\n");
487 /* adjust straw for next guy */
488 wbelow
+= ((double)weights
[reverse
[i
-1]] - lastw
) *
490 for (j
=i
; j
<size
; j
++)
491 if (weights
[reverse
[j
]] == weights
[reverse
[i
]])
495 wnext
= numleft
* (weights
[reverse
[i
]] -
496 weights
[reverse
[i
-1]]);
497 pbelow
= wbelow
/ (wbelow
+ wnext
);
498 dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n",
499 wbelow
, wnext
, pbelow
, numleft
);
501 straw
*= pow((double)1.0 / pbelow
, (double)1.0 /
504 lastw
= weights
[reverse
[i
-1]];
505 } else if (map
->straw_calc_version
>= 1) {
506 /* zero weight items get 0 length straws! */
507 if (weights
[reverse
[i
]] == 0) {
508 bucket
->straws
[reverse
[i
]] = 0;
514 /* set this item's straw */
515 bucket
->straws
[reverse
[i
]] = straw
* 0x10000;
516 dprintk("item %d at %d weight %d straw %d (%lf)\n",
517 bucket
->h
.items
[reverse
[i
]],
518 reverse
[i
], weights
[reverse
[i
]],
519 bucket
->straws
[reverse
[i
]], straw
);
524 /* adjust straw for next guy */
525 wbelow
+= ((double)weights
[reverse
[i
-1]] - lastw
) *
528 wnext
= numleft
* (weights
[reverse
[i
]] -
529 weights
[reverse
[i
-1]]);
530 pbelow
= wbelow
/ (wbelow
+ wnext
);
531 dprintk("wbelow %lf wnext %lf pbelow %lf numleft %d\n",
532 wbelow
, wnext
, pbelow
, numleft
);
534 straw
*= pow((double)1.0 / pbelow
, (double)1.0 /
537 lastw
= weights
[reverse
[i
-1]];
545 struct crush_bucket_straw
*
546 crush_make_straw_bucket(struct crush_map
*map
,
553 struct crush_bucket_straw
*bucket
;
556 bucket
= malloc(sizeof(*bucket
));
559 memset(bucket
, 0, sizeof(*bucket
));
560 bucket
->h
.alg
= CRUSH_BUCKET_STRAW
;
561 bucket
->h
.hash
= hash
;
562 bucket
->h
.type
= type
;
563 bucket
->h
.size
= size
;
565 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
566 if (!bucket
->h
.items
)
568 bucket
->item_weights
= malloc(sizeof(__u32
)*size
);
569 if (!bucket
->item_weights
)
571 bucket
->straws
= malloc(sizeof(__u32
)*size
);
575 bucket
->h
.weight
= 0;
576 for (i
=0; i
<size
; i
++) {
577 bucket
->h
.items
[i
] = items
[i
];
578 bucket
->h
.weight
+= weights
[i
];
579 bucket
->item_weights
[i
] = weights
[i
];
582 if (crush_calc_straw(map
, bucket
) < 0)
587 free(bucket
->straws
);
588 free(bucket
->item_weights
);
589 free(bucket
->h
.items
);
594 struct crush_bucket_straw2
*
595 crush_make_straw2_bucket(struct crush_map
*map
,
602 struct crush_bucket_straw2
*bucket
;
605 bucket
= malloc(sizeof(*bucket
));
608 memset(bucket
, 0, sizeof(*bucket
));
609 bucket
->h
.alg
= CRUSH_BUCKET_STRAW2
;
610 bucket
->h
.hash
= hash
;
611 bucket
->h
.type
= type
;
612 bucket
->h
.size
= size
;
614 bucket
->h
.items
= malloc(sizeof(__s32
)*size
);
615 if (!bucket
->h
.items
)
617 bucket
->item_weights
= malloc(sizeof(__u32
)*size
);
618 if (!bucket
->item_weights
)
621 bucket
->h
.weight
= 0;
622 for (i
=0; i
<size
; i
++) {
623 bucket
->h
.items
[i
] = items
[i
];
624 bucket
->h
.weight
+= weights
[i
];
625 bucket
->item_weights
[i
] = weights
[i
];
630 free(bucket
->item_weights
);
631 free(bucket
->h
.items
);
639 crush_make_bucket(struct crush_map
*map
,
640 int alg
, int hash
, int type
, int size
,
647 case CRUSH_BUCKET_UNIFORM
:
649 item_weight
= weights
[0];
652 return (struct crush_bucket
*)crush_make_uniform_bucket(hash
, type
, size
, items
, item_weight
);
654 case CRUSH_BUCKET_LIST
:
655 return (struct crush_bucket
*)crush_make_list_bucket(hash
, type
, size
, items
, weights
);
657 case CRUSH_BUCKET_TREE
:
658 return (struct crush_bucket
*)crush_make_tree_bucket(hash
, type
, size
, items
, weights
);
660 case CRUSH_BUCKET_STRAW
:
661 return (struct crush_bucket
*)crush_make_straw_bucket(map
, hash
, type
, size
, items
, weights
);
662 case CRUSH_BUCKET_STRAW2
:
663 return (struct crush_bucket
*)crush_make_straw2_bucket(map
, hash
, type
, size
, items
, weights
);
669 /************************************************/
671 int crush_add_uniform_bucket_item(struct crush_bucket_uniform
*bucket
, int item
, int weight
)
673 int newsize
= bucket
->h
.size
+ 1;
674 void *_realloc
= NULL
;
676 /* In such situation 'CRUSH_BUCKET_UNIFORM', the weight
677 provided for the item should be the same as
678 bucket->item_weight defined with 'crush_make_bucket'. This
679 assumption is enforced by the return value which is always
681 if (bucket
->item_weight
!= weight
) {
685 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
688 bucket
->h
.items
= _realloc
;
691 bucket
->h
.items
[newsize
-1] = item
;
693 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
696 bucket
->h
.weight
+= weight
;
702 int crush_add_list_bucket_item(struct crush_bucket_list
*bucket
, int item
, int weight
)
704 int newsize
= bucket
->h
.size
+ 1;
705 void *_realloc
= NULL
;
707 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
710 bucket
->h
.items
= _realloc
;
712 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
715 bucket
->item_weights
= _realloc
;
717 if ((_realloc
= realloc(bucket
->sum_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
720 bucket
->sum_weights
= _realloc
;
723 bucket
->h
.items
[newsize
-1] = item
;
724 bucket
->item_weights
[newsize
-1] = weight
;
727 if (crush_addition_is_unsafe(bucket
->sum_weights
[newsize
-2], weight
))
730 bucket
->sum_weights
[newsize
-1] = bucket
->sum_weights
[newsize
-2] + weight
;
734 bucket
->sum_weights
[newsize
-1] = weight
;
737 bucket
->h
.weight
+= weight
;
742 int crush_add_tree_bucket_item(struct crush_bucket_tree
*bucket
, int item
, int weight
)
744 int newsize
= bucket
->h
.size
+ 1;
745 int depth
= calc_depth(newsize
);;
748 void *_realloc
= NULL
;
750 bucket
->num_nodes
= 1 << depth
;
752 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
755 bucket
->h
.items
= _realloc
;
757 if ((_realloc
= realloc(bucket
->node_weights
, sizeof(__u32
)*bucket
->num_nodes
)) == NULL
) {
760 bucket
->node_weights
= _realloc
;
763 node
= crush_calc_tree_node(newsize
-1);
764 bucket
->node_weights
[node
] = weight
;
766 /* if the depth increase, we need to initialize the new root node's weight before add bucket item */
767 int root
= bucket
->num_nodes
/2;
768 if (depth
>= 2 && (node
- 1) == root
) {
769 /* if the new item is the first node in right sub tree, so
770 * the root node initial weight is left sub tree's weight
772 bucket
->node_weights
[root
] = bucket
->node_weights
[root
/2];
775 for (j
=1; j
<depth
; j
++) {
778 if (crush_addition_is_unsafe(bucket
->node_weights
[node
], weight
))
781 bucket
->node_weights
[node
] += weight
;
782 dprintk(" node %d weight %d\n", node
, bucket
->node_weights
[node
]);
786 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
789 bucket
->h
.items
[newsize
-1] = item
;
790 bucket
->h
.weight
+= weight
;
796 int crush_add_straw_bucket_item(struct crush_map
*map
,
797 struct crush_bucket_straw
*bucket
,
798 int item
, int weight
)
800 int newsize
= bucket
->h
.size
+ 1;
802 void *_realloc
= NULL
;
804 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
807 bucket
->h
.items
= _realloc
;
809 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
812 bucket
->item_weights
= _realloc
;
814 if ((_realloc
= realloc(bucket
->straws
, sizeof(__u32
)*newsize
)) == NULL
) {
817 bucket
->straws
= _realloc
;
820 bucket
->h
.items
[newsize
-1] = item
;
821 bucket
->item_weights
[newsize
-1] = weight
;
823 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
826 bucket
->h
.weight
+= weight
;
829 return crush_calc_straw(map
, bucket
);
832 int crush_add_straw2_bucket_item(struct crush_map
*map
,
833 struct crush_bucket_straw2
*bucket
,
834 int item
, int weight
)
836 int newsize
= bucket
->h
.size
+ 1;
838 void *_realloc
= NULL
;
840 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
843 bucket
->h
.items
= _realloc
;
845 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
848 bucket
->item_weights
= _realloc
;
851 bucket
->h
.items
[newsize
-1] = item
;
852 bucket
->item_weights
[newsize
-1] = weight
;
854 if (crush_addition_is_unsafe(bucket
->h
.weight
, weight
))
857 bucket
->h
.weight
+= weight
;
863 int crush_bucket_add_item(struct crush_map
*map
,
864 struct crush_bucket
*b
, int item
, int weight
)
867 case CRUSH_BUCKET_UNIFORM
:
868 return crush_add_uniform_bucket_item((struct crush_bucket_uniform
*)b
, item
, weight
);
869 case CRUSH_BUCKET_LIST
:
870 return crush_add_list_bucket_item((struct crush_bucket_list
*)b
, item
, weight
);
871 case CRUSH_BUCKET_TREE
:
872 return crush_add_tree_bucket_item((struct crush_bucket_tree
*)b
, item
, weight
);
873 case CRUSH_BUCKET_STRAW
:
874 return crush_add_straw_bucket_item(map
, (struct crush_bucket_straw
*)b
, item
, weight
);
875 case CRUSH_BUCKET_STRAW2
:
876 return crush_add_straw2_bucket_item(map
, (struct crush_bucket_straw2
*)b
, item
, weight
);
882 /************************************************/
884 int crush_remove_uniform_bucket_item(struct crush_bucket_uniform
*bucket
, int item
)
888 void *_realloc
= NULL
;
890 for (i
= 0; i
< bucket
->h
.size
; i
++)
891 if (bucket
->h
.items
[i
] == item
)
893 if (i
== bucket
->h
.size
)
896 for (j
= i
; j
< bucket
->h
.size
; j
++)
897 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
898 newsize
= --bucket
->h
.size
;
899 if (bucket
->item_weight
< bucket
->h
.weight
)
900 bucket
->h
.weight
-= bucket
->item_weight
;
902 bucket
->h
.weight
= 0;
904 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
907 bucket
->h
.items
= _realloc
;
912 int crush_remove_list_bucket_item(struct crush_bucket_list
*bucket
, int item
)
918 for (i
= 0; i
< bucket
->h
.size
; i
++)
919 if (bucket
->h
.items
[i
] == item
)
921 if (i
== bucket
->h
.size
)
924 weight
= bucket
->item_weights
[i
];
925 for (j
= i
; j
< bucket
->h
.size
; j
++) {
926 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
927 bucket
->item_weights
[j
] = bucket
->item_weights
[j
+1];
928 bucket
->sum_weights
[j
] = bucket
->sum_weights
[j
+1] - weight
;
930 if (weight
< bucket
->h
.weight
)
931 bucket
->h
.weight
-= weight
;
933 bucket
->h
.weight
= 0;
934 newsize
= --bucket
->h
.size
;
936 void *_realloc
= NULL
;
938 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
941 bucket
->h
.items
= _realloc
;
943 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
946 bucket
->item_weights
= _realloc
;
948 if ((_realloc
= realloc(bucket
->sum_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
951 bucket
->sum_weights
= _realloc
;
956 int crush_remove_tree_bucket_item(struct crush_bucket_tree
*bucket
, int item
)
961 for (i
= 0; i
< bucket
->h
.size
; i
++) {
965 int depth
= calc_depth(bucket
->h
.size
);
967 if (bucket
->h
.items
[i
] != item
)
970 bucket
->h
.items
[i
] = 0;
971 node
= crush_calc_tree_node(i
);
972 weight
= bucket
->node_weights
[node
];
973 bucket
->node_weights
[node
] = 0;
975 for (j
= 1; j
< depth
; j
++) {
977 bucket
->node_weights
[node
] -= weight
;
978 dprintk(" node %d weight %d\n", node
, bucket
->node_weights
[node
]);
980 if (weight
< bucket
->h
.weight
)
981 bucket
->h
.weight
-= weight
;
983 bucket
->h
.weight
= 0;
986 if (i
== bucket
->h
.size
)
989 newsize
= bucket
->h
.size
;
990 while (newsize
> 0) {
991 int node
= crush_calc_tree_node(newsize
- 1);
992 if (bucket
->node_weights
[node
])
997 if (newsize
!= bucket
->h
.size
) {
998 int olddepth
, newdepth
;
1000 void *_realloc
= NULL
;
1002 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
1005 bucket
->h
.items
= _realloc
;
1008 olddepth
= calc_depth(bucket
->h
.size
);
1009 newdepth
= calc_depth(newsize
);
1010 if (olddepth
!= newdepth
) {
1011 bucket
->num_nodes
= 1 << newdepth
;
1012 if ((_realloc
= realloc(bucket
->node_weights
,
1013 sizeof(__u32
)*bucket
->num_nodes
)) == NULL
) {
1016 bucket
->node_weights
= _realloc
;
1020 bucket
->h
.size
= newsize
;
1025 int crush_remove_straw_bucket_item(struct crush_map
*map
,
1026 struct crush_bucket_straw
*bucket
, int item
)
1028 int newsize
= bucket
->h
.size
- 1;
1031 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1032 if (bucket
->h
.items
[i
] == item
) {
1033 if (bucket
->item_weights
[i
] < bucket
->h
.weight
)
1034 bucket
->h
.weight
-= bucket
->item_weights
[i
];
1036 bucket
->h
.weight
= 0;
1037 for (j
= i
; j
< bucket
->h
.size
- 1; j
++) {
1038 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
1039 bucket
->item_weights
[j
] = bucket
->item_weights
[j
+1];
1044 if (i
== bucket
->h
.size
)
1047 if (bucket
->h
.size
== 0) {
1048 /* don't bother reallocating */
1051 void *_realloc
= NULL
;
1053 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
1056 bucket
->h
.items
= _realloc
;
1058 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
1061 bucket
->item_weights
= _realloc
;
1063 if ((_realloc
= realloc(bucket
->straws
, sizeof(__u32
)*newsize
)) == NULL
) {
1066 bucket
->straws
= _realloc
;
1069 return crush_calc_straw(map
, bucket
);
1072 int crush_remove_straw2_bucket_item(struct crush_map
*map
,
1073 struct crush_bucket_straw2
*bucket
, int item
)
1075 int newsize
= bucket
->h
.size
- 1;
1078 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1079 if (bucket
->h
.items
[i
] == item
) {
1080 if (bucket
->item_weights
[i
] < bucket
->h
.weight
)
1081 bucket
->h
.weight
-= bucket
->item_weights
[i
];
1083 bucket
->h
.weight
= 0;
1084 for (j
= i
; j
< bucket
->h
.size
- 1; j
++) {
1085 bucket
->h
.items
[j
] = bucket
->h
.items
[j
+1];
1086 bucket
->item_weights
[j
] = bucket
->item_weights
[j
+1];
1091 if (i
== bucket
->h
.size
)
1096 /* don't bother reallocating a 0-length array. */
1100 void *_realloc
= NULL
;
1102 if ((_realloc
= realloc(bucket
->h
.items
, sizeof(__s32
)*newsize
)) == NULL
) {
1105 bucket
->h
.items
= _realloc
;
1107 if ((_realloc
= realloc(bucket
->item_weights
, sizeof(__u32
)*newsize
)) == NULL
) {
1110 bucket
->item_weights
= _realloc
;
1116 int crush_bucket_remove_item(struct crush_map
*map
, struct crush_bucket
*b
, int item
)
1119 case CRUSH_BUCKET_UNIFORM
:
1120 return crush_remove_uniform_bucket_item((struct crush_bucket_uniform
*)b
, item
);
1121 case CRUSH_BUCKET_LIST
:
1122 return crush_remove_list_bucket_item((struct crush_bucket_list
*)b
, item
);
1123 case CRUSH_BUCKET_TREE
:
1124 return crush_remove_tree_bucket_item((struct crush_bucket_tree
*)b
, item
);
1125 case CRUSH_BUCKET_STRAW
:
1126 return crush_remove_straw_bucket_item(map
, (struct crush_bucket_straw
*)b
, item
);
1127 case CRUSH_BUCKET_STRAW2
:
1128 return crush_remove_straw2_bucket_item(map
, (struct crush_bucket_straw2
*)b
, item
);
1135 /************************************************/
1137 int crush_adjust_uniform_bucket_item_weight(struct crush_bucket_uniform
*bucket
, int item
, int weight
)
1139 int diff
= (weight
- bucket
->item_weight
) * bucket
->h
.size
;
1141 bucket
->item_weight
= weight
;
1142 bucket
->h
.weight
= bucket
->item_weight
* bucket
->h
.size
;
1147 int crush_adjust_list_bucket_item_weight(struct crush_bucket_list
*bucket
, int item
, int weight
)
1152 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1153 if (bucket
->h
.items
[i
] == item
)
1156 if (i
== bucket
->h
.size
)
1159 diff
= weight
- bucket
->item_weights
[i
];
1160 bucket
->item_weights
[i
] = weight
;
1161 bucket
->h
.weight
+= diff
;
1163 for (j
= i
; j
< bucket
->h
.size
; j
++)
1164 bucket
->sum_weights
[j
] += diff
;
1169 int crush_adjust_tree_bucket_item_weight(struct crush_bucket_tree
*bucket
, int item
, int weight
)
1174 unsigned depth
= calc_depth(bucket
->h
.size
);
1176 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1177 if (bucket
->h
.items
[i
] == item
)
1180 if (i
== bucket
->h
.size
)
1183 node
= crush_calc_tree_node(i
);
1184 diff
= weight
- bucket
->node_weights
[node
];
1185 bucket
->node_weights
[node
] = weight
;
1186 bucket
->h
.weight
+= diff
;
1188 for (j
=1; j
<depth
; j
++) {
1189 node
= parent(node
);
1190 bucket
->node_weights
[node
] += diff
;
1196 int crush_adjust_straw_bucket_item_weight(struct crush_map
*map
,
1197 struct crush_bucket_straw
*bucket
,
1198 int item
, int weight
)
1204 for (idx
= 0; idx
< bucket
->h
.size
; idx
++)
1205 if (bucket
->h
.items
[idx
] == item
)
1207 if (idx
== bucket
->h
.size
)
1210 diff
= weight
- bucket
->item_weights
[idx
];
1211 bucket
->item_weights
[idx
] = weight
;
1212 bucket
->h
.weight
+= diff
;
1214 r
= crush_calc_straw(map
, bucket
);
1221 int crush_adjust_straw2_bucket_item_weight(struct crush_map
*map
,
1222 struct crush_bucket_straw2
*bucket
,
1223 int item
, int weight
)
1228 for (idx
= 0; idx
< bucket
->h
.size
; idx
++)
1229 if (bucket
->h
.items
[idx
] == item
)
1231 if (idx
== bucket
->h
.size
)
1234 diff
= weight
- bucket
->item_weights
[idx
];
1235 bucket
->item_weights
[idx
] = weight
;
1236 bucket
->h
.weight
+= diff
;
1241 int crush_bucket_adjust_item_weight(struct crush_map
*map
,
1242 struct crush_bucket
*b
,
1243 int item
, int weight
)
1246 case CRUSH_BUCKET_UNIFORM
:
1247 return crush_adjust_uniform_bucket_item_weight((struct crush_bucket_uniform
*)b
,
1249 case CRUSH_BUCKET_LIST
:
1250 return crush_adjust_list_bucket_item_weight((struct crush_bucket_list
*)b
,
1252 case CRUSH_BUCKET_TREE
:
1253 return crush_adjust_tree_bucket_item_weight((struct crush_bucket_tree
*)b
,
1255 case CRUSH_BUCKET_STRAW
:
1256 return crush_adjust_straw_bucket_item_weight(map
,
1257 (struct crush_bucket_straw
*)b
,
1259 case CRUSH_BUCKET_STRAW2
:
1260 return crush_adjust_straw2_bucket_item_weight(map
,
1261 (struct crush_bucket_straw2
*)b
,
1268 /************************************************/
1270 static int crush_reweight_uniform_bucket(struct crush_map
*map
, struct crush_bucket_uniform
*bucket
)
1273 unsigned sum
= 0, n
= 0, leaves
= 0;
1275 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1276 int id
= bucket
->h
.items
[i
];
1278 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1279 crush_reweight_bucket(map
, c
);
1281 if (crush_addition_is_unsafe(sum
, c
->weight
))
1292 bucket
->item_weight
= sum
/ n
; // more bucket children than leaves, average!
1293 bucket
->h
.weight
= bucket
->item_weight
* bucket
->h
.size
;
1298 static int crush_reweight_list_bucket(struct crush_map
*map
, struct crush_bucket_list
*bucket
)
1302 bucket
->h
.weight
= 0;
1303 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1304 int id
= bucket
->h
.items
[i
];
1306 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1307 crush_reweight_bucket(map
, c
);
1308 bucket
->item_weights
[i
] = c
->weight
;
1311 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->item_weights
[i
]))
1314 bucket
->h
.weight
+= bucket
->item_weights
[i
];
1320 static int crush_reweight_tree_bucket(struct crush_map
*map
, struct crush_bucket_tree
*bucket
)
1324 bucket
->h
.weight
= 0;
1325 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1326 int node
= crush_calc_tree_node(i
);
1327 int id
= bucket
->h
.items
[i
];
1329 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1330 crush_reweight_bucket(map
, c
);
1331 bucket
->node_weights
[node
] = c
->weight
;
1334 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->node_weights
[node
]))
1337 bucket
->h
.weight
+= bucket
->node_weights
[node
];
1345 static int crush_reweight_straw_bucket(struct crush_map
*map
, struct crush_bucket_straw
*bucket
)
1349 bucket
->h
.weight
= 0;
1350 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1351 int id
= bucket
->h
.items
[i
];
1353 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1354 crush_reweight_bucket(map
, c
);
1355 bucket
->item_weights
[i
] = c
->weight
;
1358 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->item_weights
[i
]))
1361 bucket
->h
.weight
+= bucket
->item_weights
[i
];
1363 crush_calc_straw(map
, bucket
);
1368 static int crush_reweight_straw2_bucket(struct crush_map
*map
, struct crush_bucket_straw2
*bucket
)
1372 bucket
->h
.weight
= 0;
1373 for (i
= 0; i
< bucket
->h
.size
; i
++) {
1374 int id
= bucket
->h
.items
[i
];
1376 struct crush_bucket
*c
= map
->buckets
[-1-id
];
1377 crush_reweight_bucket(map
, c
);
1378 bucket
->item_weights
[i
] = c
->weight
;
1381 if (crush_addition_is_unsafe(bucket
->h
.weight
, bucket
->item_weights
[i
]))
1384 bucket
->h
.weight
+= bucket
->item_weights
[i
];
1390 int crush_reweight_bucket(struct crush_map
*map
, struct crush_bucket
*b
)
1393 case CRUSH_BUCKET_UNIFORM
:
1394 return crush_reweight_uniform_bucket(map
, (struct crush_bucket_uniform
*)b
);
1395 case CRUSH_BUCKET_LIST
:
1396 return crush_reweight_list_bucket(map
, (struct crush_bucket_list
*)b
);
1397 case CRUSH_BUCKET_TREE
:
1398 return crush_reweight_tree_bucket(map
, (struct crush_bucket_tree
*)b
);
1399 case CRUSH_BUCKET_STRAW
:
1400 return crush_reweight_straw_bucket(map
, (struct crush_bucket_straw
*)b
);
1401 case CRUSH_BUCKET_STRAW2
:
1402 return crush_reweight_straw2_bucket(map
, (struct crush_bucket_straw2
*)b
);
1408 struct crush_choose_arg
*crush_make_choose_args(struct crush_map
*map
, int num_positions
)
1411 int sum_bucket_size
= 0;
1412 int bucket_count
= 0;
1413 for (b
= 0; b
< map
->max_buckets
; b
++) {
1414 if (map
->buckets
[b
] == 0)
1416 sum_bucket_size
+= map
->buckets
[b
]->size
;
1419 dprintk("sum_bucket_size %d max_buckets %d bucket_count %d\n",
1420 sum_bucket_size
, map
->max_buckets
, bucket_count
);
1421 int size
= (sizeof(struct crush_choose_arg
) * map
->max_buckets
+
1422 sizeof(struct crush_weight_set
) * bucket_count
* num_positions
+
1423 sizeof(__u32
) * sum_bucket_size
* num_positions
+ // weights
1424 sizeof(__s32
) * sum_bucket_size
); // ids
1425 char *space
= malloc(size
);
1426 struct crush_choose_arg
*arg
= (struct crush_choose_arg
*)space
;
1427 struct crush_weight_set
*weight_set
= (struct crush_weight_set
*)(arg
+ map
->max_buckets
);
1428 __u32
*weights
= (__u32
*)(weight_set
+ bucket_count
* num_positions
);
1429 char *weight_set_ends
= (char*)weights
;
1430 __s32
*ids
= (__s32
*)(weights
+ sum_bucket_size
* num_positions
);
1431 char *weights_end
= (char *)ids
;
1432 char *ids_end
= (char *)(ids
+ sum_bucket_size
);
1433 BUG_ON(space
+ size
!= ids_end
);
1434 for (b
= 0; b
< map
->max_buckets
; b
++) {
1435 if (map
->buckets
[b
] == 0) {
1436 memset(&arg
[b
], '\0', sizeof(struct crush_choose_arg
));
1439 struct crush_bucket_straw2
*bucket
= (struct crush_bucket_straw2
*)map
->buckets
[b
];
1442 for (position
= 0; position
< num_positions
; position
++) {
1443 memcpy(weights
, bucket
->item_weights
, sizeof(__u32
) * bucket
->h
.size
);
1444 weight_set
[position
].weights
= weights
;
1445 weight_set
[position
].size
= bucket
->h
.size
;
1446 dprintk("moving weight %d bytes forward\n", (int)((weights
+ bucket
->h
.size
) - weights
));
1447 weights
+= bucket
->h
.size
;
1449 arg
[b
].weight_set
= weight_set
;
1450 arg
[b
].weight_set_positions
= num_positions
;
1451 weight_set
+= position
;
1453 memcpy(ids
, bucket
->h
.items
, sizeof(__s32
) * bucket
->h
.size
);
1455 arg
[b
].ids_size
= bucket
->h
.size
;
1456 ids
+= bucket
->h
.size
;
1458 BUG_ON((char*)weight_set_ends
!= (char*)weight_set
);
1459 BUG_ON((char*)weights_end
!= (char*)weights
);
1460 BUG_ON((char*)ids
!= (char*)ids_end
);
1464 void crush_destroy_choose_args(struct crush_choose_arg
*args
)
1469 /***************************/
1471 /* methods to check for safe arithmetic operations */
1473 int crush_addition_is_unsafe(__u32 a
, __u32 b
)
1475 if ((((__u32
)(-1)) - b
) < a
)
1481 int crush_multiplication_is_unsafe(__u32 a
, __u32 b
)
1483 /* prevent division by zero */
1488 if ((((__u32
)(-1)) / b
) < a
)
1494 /***************************/
1496 /* methods to configure crush_map */
1498 void set_legacy_crush_map(struct crush_map
*map
) {
1499 /* initialize legacy tunable values */
1500 map
->choose_local_tries
= 2;
1501 map
->choose_local_fallback_tries
= 5;
1502 map
->choose_total_tries
= 19;
1503 map
->chooseleaf_descend_once
= 0;
1504 map
->chooseleaf_vary_r
= 0;
1505 map
->chooseleaf_stable
= 0;
1506 map
->straw_calc_version
= 0;
1508 // by default, use legacy types, and also exclude tree,
1509 // since it was buggy.
1510 map
->allowed_bucket_algs
= CRUSH_LEGACY_ALLOWED_BUCKET_ALGS
;
1513 void set_optimal_crush_map(struct crush_map
*map
) {
1514 map
->choose_local_tries
= 0;
1515 map
->choose_local_fallback_tries
= 0;
1516 map
->choose_total_tries
= 50;
1517 map
->chooseleaf_descend_once
= 1;
1518 map
->chooseleaf_vary_r
= 1;
1519 map
->chooseleaf_stable
= 1;
1520 map
->allowed_bucket_algs
= (
1521 (1 << CRUSH_BUCKET_UNIFORM
) |
1522 (1 << CRUSH_BUCKET_LIST
) |
1523 (1 << CRUSH_BUCKET_STRAW
) |
1524 (1 << CRUSH_BUCKET_STRAW2
));