]>
Commit | Line | Data |
---|---|---|
5ecc0a0f SW |
1 | |
2 | #ifdef __KERNEL__ | |
3 | # include <linux/string.h> | |
4 | # include <linux/slab.h> | |
5 | # include <linux/bug.h> | |
6 | # include <linux/kernel.h> | |
7 | # ifndef dprintk | |
8 | # define dprintk(args...) | |
9 | # endif | |
10 | #else | |
11 | # include <string.h> | |
12 | # include <stdio.h> | |
13 | # include <stdlib.h> | |
14 | # include <assert.h> | |
15 | # define BUG_ON(x) assert(!(x)) | |
16 | # define dprintk(args...) /* printf(args) */ | |
17 | # define kmalloc(x, f) malloc(x) | |
18 | # define kfree(x) free(x) | |
19 | #endif | |
20 | ||
3d14c5d2 YS |
21 | #include <linux/crush/crush.h> |
22 | #include <linux/crush/hash.h> | |
5ecc0a0f SW |
23 | |
24 | /* | |
25 | * Implement the core CRUSH mapping algorithm. | |
26 | */ | |
27 | ||
28 | /** | |
29 | * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. | |
30 | * @map: the crush_map | |
31 | * @ruleset: the storage ruleset id (user defined) | |
32 | * @type: storage ruleset type (user defined) | |
33 | * @size: output set size | |
34 | */ | |
35 | int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) | |
36 | { | |
37 | int i; | |
38 | ||
39 | for (i = 0; i < map->max_rules; i++) { | |
40 | if (map->rules[i] && | |
41 | map->rules[i]->mask.ruleset == ruleset && | |
42 | map->rules[i]->mask.type == type && | |
43 | map->rules[i]->mask.min_size <= size && | |
44 | map->rules[i]->mask.max_size >= size) | |
45 | return i; | |
46 | } | |
47 | return -1; | |
48 | } | |
49 | ||
50 | ||
51 | /* | |
52 | * bucket choose methods | |
53 | * | |
54 | * For each bucket algorithm, we have a "choose" method that, given a | |
55 | * crush input @x and replica position (usually, position in output set) @r, | |
56 | * will produce an item in the bucket. | |
57 | */ | |
58 | ||
59 | /* | |
60 | * Choose based on a random permutation of the bucket. | |
61 | * | |
62 | * We used to use some prime number arithmetic to do this, but it | |
63 | * wasn't very random, and had some other bad behaviors. Instead, we | |
64 | * calculate an actual random permutation of the bucket members. | |
65 | * Since this is expensive, we optimize for the r=0 case, which | |
66 | * captures the vast majority of calls. | |
67 | */ | |
68 | static int bucket_perm_choose(struct crush_bucket *bucket, | |
69 | int x, int r) | |
70 | { | |
71 | unsigned pr = r % bucket->size; | |
72 | unsigned i, s; | |
73 | ||
74 | /* start a new permutation if @x has changed */ | |
75 | if (bucket->perm_x != x || bucket->perm_n == 0) { | |
76 | dprintk("bucket %d new x=%d\n", bucket->id, x); | |
77 | bucket->perm_x = x; | |
78 | ||
79 | /* optimize common r=0 case */ | |
80 | if (pr == 0) { | |
fb690390 | 81 | s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % |
5ecc0a0f SW |
82 | bucket->size; |
83 | bucket->perm[0] = s; | |
84 | bucket->perm_n = 0xffff; /* magic value, see below */ | |
85 | goto out; | |
86 | } | |
87 | ||
88 | for (i = 0; i < bucket->size; i++) | |
89 | bucket->perm[i] = i; | |
90 | bucket->perm_n = 0; | |
91 | } else if (bucket->perm_n == 0xffff) { | |
92 | /* clean up after the r=0 case above */ | |
93 | for (i = 1; i < bucket->size; i++) | |
94 | bucket->perm[i] = i; | |
95 | bucket->perm[bucket->perm[0]] = 0; | |
96 | bucket->perm_n = 1; | |
97 | } | |
98 | ||
99 | /* calculate permutation up to pr */ | |
100 | for (i = 0; i < bucket->perm_n; i++) | |
101 | dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); | |
102 | while (bucket->perm_n <= pr) { | |
103 | unsigned p = bucket->perm_n; | |
104 | /* no point in swapping the final entry */ | |
105 | if (p < bucket->size - 1) { | |
fb690390 | 106 | i = crush_hash32_3(bucket->hash, x, bucket->id, p) % |
5ecc0a0f SW |
107 | (bucket->size - p); |
108 | if (i) { | |
109 | unsigned t = bucket->perm[p + i]; | |
110 | bucket->perm[p + i] = bucket->perm[p]; | |
111 | bucket->perm[p] = t; | |
112 | } | |
113 | dprintk(" perm_choose swap %d with %d\n", p, p+i); | |
114 | } | |
115 | bucket->perm_n++; | |
116 | } | |
117 | for (i = 0; i < bucket->size; i++) | |
118 | dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); | |
119 | ||
120 | s = bucket->perm[pr]; | |
121 | out: | |
122 | dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, | |
123 | bucket->size, x, r, pr, s); | |
124 | return bucket->items[s]; | |
125 | } | |
126 | ||
127 | /* uniform */ | |
128 | static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, | |
129 | int x, int r) | |
130 | { | |
131 | return bucket_perm_choose(&bucket->h, x, r); | |
132 | } | |
133 | ||
134 | /* list */ | |
135 | static int bucket_list_choose(struct crush_bucket_list *bucket, | |
136 | int x, int r) | |
137 | { | |
138 | int i; | |
139 | ||
140 | for (i = bucket->h.size-1; i >= 0; i--) { | |
fb690390 SW |
141 | __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], |
142 | r, bucket->h.id); | |
5ecc0a0f SW |
143 | w &= 0xffff; |
144 | dprintk("list_choose i=%d x=%d r=%d item %d weight %x " | |
145 | "sw %x rand %llx", | |
146 | i, x, r, bucket->h.items[i], bucket->item_weights[i], | |
147 | bucket->sum_weights[i], w); | |
148 | w *= bucket->sum_weights[i]; | |
149 | w = w >> 16; | |
150 | /*dprintk(" scaled %llx\n", w);*/ | |
151 | if (w < bucket->item_weights[i]) | |
152 | return bucket->h.items[i]; | |
153 | } | |
154 | ||
155 | BUG_ON(1); | |
156 | return 0; | |
157 | } | |
158 | ||
159 | ||
160 | /* (binary) tree */ | |
161 | static int height(int n) | |
162 | { | |
163 | int h = 0; | |
164 | while ((n & 1) == 0) { | |
165 | h++; | |
166 | n = n >> 1; | |
167 | } | |
168 | return h; | |
169 | } | |
170 | ||
171 | static int left(int x) | |
172 | { | |
173 | int h = height(x); | |
174 | return x - (1 << (h-1)); | |
175 | } | |
176 | ||
177 | static int right(int x) | |
178 | { | |
179 | int h = height(x); | |
180 | return x + (1 << (h-1)); | |
181 | } | |
182 | ||
183 | static int terminal(int x) | |
184 | { | |
185 | return x & 1; | |
186 | } | |
187 | ||
188 | static int bucket_tree_choose(struct crush_bucket_tree *bucket, | |
189 | int x, int r) | |
190 | { | |
191 | int n, l; | |
192 | __u32 w; | |
193 | __u64 t; | |
194 | ||
195 | /* start at root */ | |
196 | n = bucket->num_nodes >> 1; | |
197 | ||
198 | while (!terminal(n)) { | |
199 | /* pick point in [0, w) */ | |
200 | w = bucket->node_weights[n]; | |
fb690390 SW |
201 | t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, |
202 | bucket->h.id) * (__u64)w; | |
5ecc0a0f SW |
203 | t = t >> 32; |
204 | ||
205 | /* descend to the left or right? */ | |
206 | l = left(n); | |
207 | if (t < bucket->node_weights[l]) | |
208 | n = l; | |
209 | else | |
210 | n = right(n); | |
211 | } | |
212 | ||
213 | return bucket->h.items[n >> 1]; | |
214 | } | |
215 | ||
216 | ||
217 | /* straw */ | |
218 | ||
219 | static int bucket_straw_choose(struct crush_bucket_straw *bucket, | |
220 | int x, int r) | |
221 | { | |
222 | int i; | |
223 | int high = 0; | |
224 | __u64 high_draw = 0; | |
225 | __u64 draw; | |
226 | ||
227 | for (i = 0; i < bucket->h.size; i++) { | |
fb690390 | 228 | draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); |
5ecc0a0f SW |
229 | draw &= 0xffff; |
230 | draw *= bucket->straws[i]; | |
231 | if (i == 0 || draw > high_draw) { | |
232 | high = i; | |
233 | high_draw = draw; | |
234 | } | |
235 | } | |
236 | return bucket->h.items[high]; | |
237 | } | |
238 | ||
239 | static int crush_bucket_choose(struct crush_bucket *in, int x, int r) | |
240 | { | |
a1a31e73 | 241 | dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); |
5ecc0a0f SW |
242 | switch (in->alg) { |
243 | case CRUSH_BUCKET_UNIFORM: | |
244 | return bucket_uniform_choose((struct crush_bucket_uniform *)in, | |
245 | x, r); | |
246 | case CRUSH_BUCKET_LIST: | |
247 | return bucket_list_choose((struct crush_bucket_list *)in, | |
248 | x, r); | |
249 | case CRUSH_BUCKET_TREE: | |
250 | return bucket_tree_choose((struct crush_bucket_tree *)in, | |
251 | x, r); | |
252 | case CRUSH_BUCKET_STRAW: | |
253 | return bucket_straw_choose((struct crush_bucket_straw *)in, | |
254 | x, r); | |
255 | default: | |
256 | BUG_ON(1); | |
50b885b9 | 257 | return in->items[0]; |
5ecc0a0f SW |
258 | } |
259 | } | |
260 | ||
261 | /* | |
262 | * true if device is marked "out" (failed, fully offloaded) | |
263 | * of the cluster | |
264 | */ | |
265 | static int is_out(struct crush_map *map, __u32 *weight, int item, int x) | |
266 | { | |
153a1093 | 267 | if (weight[item] >= 0x10000) |
5ecc0a0f SW |
268 | return 0; |
269 | if (weight[item] == 0) | |
270 | return 1; | |
fb690390 SW |
271 | if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) |
272 | < weight[item]) | |
5ecc0a0f SW |
273 | return 0; |
274 | return 1; | |
275 | } | |
276 | ||
277 | /** | |
278 | * crush_choose - choose numrep distinct items of given type | |
279 | * @map: the crush_map | |
280 | * @bucket: the bucket we are choose an item from | |
281 | * @x: crush input value | |
282 | * @numrep: the number of items to choose | |
283 | * @type: the type of item to choose | |
284 | * @out: pointer to output vector | |
285 | * @outpos: our position in that vector | |
286 | * @firstn: true if choosing "first n" items, false if choosing "indep" | |
287 | * @recurse_to_leaf: true if we want one device under each item of given type | |
288 | * @out2: second output vector for leaf items (if @recurse_to_leaf) | |
289 | */ | |
290 | static int crush_choose(struct crush_map *map, | |
291 | struct crush_bucket *bucket, | |
292 | __u32 *weight, | |
293 | int x, int numrep, int type, | |
294 | int *out, int outpos, | |
295 | int firstn, int recurse_to_leaf, | |
296 | int *out2) | |
297 | { | |
298 | int rep; | |
299 | int ftotal, flocal; | |
300 | int retry_descent, retry_bucket, skip_rep; | |
301 | struct crush_bucket *in = bucket; | |
302 | int r; | |
303 | int i; | |
b28813a6 | 304 | int item = 0; |
5ecc0a0f SW |
305 | int itemtype; |
306 | int collide, reject; | |
307 | const int orig_tries = 5; /* attempts before we fall back to search */ | |
a1a31e73 SW |
308 | |
309 | dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", | |
310 | bucket->id, x, outpos, numrep); | |
5ecc0a0f SW |
311 | |
312 | for (rep = outpos; rep < numrep; rep++) { | |
313 | /* keep trying until we get a non-out, non-colliding item */ | |
314 | ftotal = 0; | |
315 | skip_rep = 0; | |
316 | do { | |
317 | retry_descent = 0; | |
318 | in = bucket; /* initial bucket */ | |
319 | ||
320 | /* choose through intervening buckets */ | |
321 | flocal = 0; | |
322 | do { | |
b28813a6 | 323 | collide = 0; |
5ecc0a0f SW |
324 | retry_bucket = 0; |
325 | r = rep; | |
326 | if (in->alg == CRUSH_BUCKET_UNIFORM) { | |
327 | /* be careful */ | |
328 | if (firstn || numrep >= in->size) | |
329 | /* r' = r + f_total */ | |
330 | r += ftotal; | |
331 | else if (in->size % numrep == 0) | |
332 | /* r'=r+(n+1)*f_local */ | |
333 | r += (numrep+1) * | |
334 | (flocal+ftotal); | |
335 | else | |
336 | /* r' = r + n*f_local */ | |
337 | r += numrep * (flocal+ftotal); | |
338 | } else { | |
339 | if (firstn) | |
340 | /* r' = r + f_total */ | |
341 | r += ftotal; | |
342 | else | |
343 | /* r' = r + n*f_local */ | |
344 | r += numrep * (flocal+ftotal); | |
345 | } | |
346 | ||
347 | /* bucket choose */ | |
b28813a6 SW |
348 | if (in->size == 0) { |
349 | reject = 1; | |
350 | goto reject; | |
351 | } | |
5ecc0a0f SW |
352 | if (flocal >= (in->size>>1) && |
353 | flocal > orig_tries) | |
354 | item = bucket_perm_choose(in, x, r); | |
355 | else | |
356 | item = crush_bucket_choose(in, x, r); | |
357 | BUG_ON(item >= map->max_devices); | |
358 | ||
359 | /* desired type? */ | |
360 | if (item < 0) | |
361 | itemtype = map->buckets[-1-item]->type; | |
362 | else | |
363 | itemtype = 0; | |
364 | dprintk(" item %d type %d\n", item, itemtype); | |
365 | ||
366 | /* keep going? */ | |
367 | if (itemtype != type) { | |
368 | BUG_ON(item >= 0 || | |
369 | (-1-item) >= map->max_buckets); | |
370 | in = map->buckets[-1-item]; | |
55bda7aa | 371 | retry_bucket = 1; |
5ecc0a0f SW |
372 | continue; |
373 | } | |
374 | ||
375 | /* collision? */ | |
5ecc0a0f SW |
376 | for (i = 0; i < outpos; i++) { |
377 | if (out[i] == item) { | |
378 | collide = 1; | |
379 | break; | |
380 | } | |
381 | } | |
382 | ||
a1a31e73 SW |
383 | reject = 0; |
384 | if (recurse_to_leaf) { | |
385 | if (item < 0) { | |
386 | if (crush_choose(map, | |
387 | map->buckets[-1-item], | |
388 | weight, | |
389 | x, outpos+1, 0, | |
390 | out2, outpos, | |
391 | firstn, 0, | |
392 | NULL) <= outpos) | |
393 | /* didn't get leaf */ | |
394 | reject = 1; | |
395 | } else { | |
396 | /* we already have a leaf! */ | |
397 | out2[outpos] = item; | |
398 | } | |
399 | } | |
400 | ||
401 | if (!reject) { | |
5ecc0a0f SW |
402 | /* out? */ |
403 | if (itemtype == 0) | |
404 | reject = is_out(map, weight, | |
405 | item, x); | |
406 | else | |
407 | reject = 0; | |
408 | } | |
409 | ||
b28813a6 | 410 | reject: |
5ecc0a0f SW |
411 | if (reject || collide) { |
412 | ftotal++; | |
413 | flocal++; | |
414 | ||
415 | if (collide && flocal < 3) | |
416 | /* retry locally a few times */ | |
417 | retry_bucket = 1; | |
418 | else if (flocal < in->size + orig_tries) | |
419 | /* exhaustive bucket search */ | |
420 | retry_bucket = 1; | |
421 | else if (ftotal < 20) | |
422 | /* then retry descent */ | |
423 | retry_descent = 1; | |
424 | else | |
425 | /* else give up */ | |
426 | skip_rep = 1; | |
427 | dprintk(" reject %d collide %d " | |
428 | "ftotal %d flocal %d\n", | |
429 | reject, collide, ftotal, | |
430 | flocal); | |
431 | } | |
432 | } while (retry_bucket); | |
433 | } while (retry_descent); | |
434 | ||
435 | if (skip_rep) { | |
436 | dprintk("skip rep\n"); | |
437 | continue; | |
438 | } | |
439 | ||
a1a31e73 | 440 | dprintk("CHOOSE got %d\n", item); |
5ecc0a0f SW |
441 | out[outpos] = item; |
442 | outpos++; | |
443 | } | |
444 | ||
a1a31e73 | 445 | dprintk("CHOOSE returns %d\n", outpos); |
5ecc0a0f SW |
446 | return outpos; |
447 | } | |
448 | ||
449 | ||
450 | /** | |
451 | * crush_do_rule - calculate a mapping with the given input and rule | |
452 | * @map: the crush_map | |
453 | * @ruleno: the rule id | |
454 | * @x: hash input | |
455 | * @result: pointer to result vector | |
456 | * @result_max: maximum result size | |
457 | * @force: force initial replica choice; -1 for none | |
458 | */ | |
459 | int crush_do_rule(struct crush_map *map, | |
460 | int ruleno, int x, int *result, int result_max, | |
461 | int force, __u32 *weight) | |
462 | { | |
463 | int result_len; | |
464 | int force_context[CRUSH_MAX_DEPTH]; | |
465 | int force_pos = -1; | |
466 | int a[CRUSH_MAX_SET]; | |
467 | int b[CRUSH_MAX_SET]; | |
468 | int c[CRUSH_MAX_SET]; | |
469 | int recurse_to_leaf; | |
470 | int *w; | |
471 | int wsize = 0; | |
472 | int *o; | |
473 | int osize; | |
474 | int *tmp; | |
475 | struct crush_rule *rule; | |
476 | int step; | |
477 | int i, j; | |
478 | int numrep; | |
479 | int firstn; | |
5ecc0a0f SW |
480 | |
481 | BUG_ON(ruleno >= map->max_rules); | |
482 | ||
483 | rule = map->rules[ruleno]; | |
484 | result_len = 0; | |
485 | w = a; | |
486 | o = b; | |
487 | ||
488 | /* | |
489 | * determine hierarchical context of force, if any. note | |
490 | * that this may or may not correspond to the specific types | |
491 | * referenced by the crush rule. | |
492 | */ | |
f1932fc1 SW |
493 | if (force >= 0 && |
494 | force < map->max_devices && | |
495 | map->device_parents[force] != 0 && | |
496 | !is_out(map, weight, force, x)) { | |
497 | while (1) { | |
498 | force_context[++force_pos] = force; | |
499 | if (force >= 0) | |
500 | force = map->device_parents[force]; | |
501 | else | |
502 | force = map->bucket_parents[-1-force]; | |
503 | if (force == 0) | |
504 | break; | |
5ecc0a0f SW |
505 | } |
506 | } | |
507 | ||
508 | for (step = 0; step < rule->len; step++) { | |
509 | firstn = 0; | |
510 | switch (rule->steps[step].op) { | |
511 | case CRUSH_RULE_TAKE: | |
512 | w[0] = rule->steps[step].arg1; | |
e11b05d3 SW |
513 | |
514 | /* find position in force_context/hierarchy */ | |
515 | while (force_pos >= 0 && | |
516 | force_context[force_pos] != w[0]) | |
5ecc0a0f | 517 | force_pos--; |
e11b05d3 SW |
518 | /* and move past it */ |
519 | if (force_pos >= 0) | |
520 | force_pos--; | |
521 | ||
5ecc0a0f SW |
522 | wsize = 1; |
523 | break; | |
524 | ||
525 | case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: | |
526 | case CRUSH_RULE_CHOOSE_FIRSTN: | |
527 | firstn = 1; | |
528 | case CRUSH_RULE_CHOOSE_LEAF_INDEP: | |
529 | case CRUSH_RULE_CHOOSE_INDEP: | |
530 | BUG_ON(wsize == 0); | |
531 | ||
532 | recurse_to_leaf = | |
533 | rule->steps[step].op == | |
534 | CRUSH_RULE_CHOOSE_LEAF_FIRSTN || | |
535 | rule->steps[step].op == | |
536 | CRUSH_RULE_CHOOSE_LEAF_INDEP; | |
537 | ||
538 | /* reset output */ | |
539 | osize = 0; | |
540 | ||
541 | for (i = 0; i < wsize; i++) { | |
542 | /* | |
543 | * see CRUSH_N, CRUSH_N_MINUS macros. | |
544 | * basically, numrep <= 0 means relative to | |
545 | * the provided result_max | |
546 | */ | |
547 | numrep = rule->steps[step].arg1; | |
548 | if (numrep <= 0) { | |
549 | numrep += result_max; | |
550 | if (numrep <= 0) | |
551 | continue; | |
552 | } | |
553 | j = 0; | |
554 | if (osize == 0 && force_pos >= 0) { | |
555 | /* skip any intermediate types */ | |
556 | while (force_pos && | |
557 | force_context[force_pos] < 0 && | |
558 | rule->steps[step].arg2 != | |
559 | map->buckets[-1 - | |
560 | force_context[force_pos]]->type) | |
561 | force_pos--; | |
562 | o[osize] = force_context[force_pos]; | |
563 | if (recurse_to_leaf) | |
564 | c[osize] = force_context[0]; | |
565 | j++; | |
566 | force_pos--; | |
567 | } | |
568 | osize += crush_choose(map, | |
569 | map->buckets[-1-w[i]], | |
570 | weight, | |
571 | x, numrep, | |
572 | rule->steps[step].arg2, | |
573 | o+osize, j, | |
574 | firstn, | |
575 | recurse_to_leaf, c+osize); | |
576 | } | |
577 | ||
578 | if (recurse_to_leaf) | |
579 | /* copy final _leaf_ values to output set */ | |
580 | memcpy(o, c, osize*sizeof(*o)); | |
581 | ||
582 | /* swap t and w arrays */ | |
583 | tmp = o; | |
584 | o = w; | |
585 | w = tmp; | |
586 | wsize = osize; | |
587 | break; | |
588 | ||
589 | ||
590 | case CRUSH_RULE_EMIT: | |
591 | for (i = 0; i < wsize && result_len < result_max; i++) { | |
592 | result[result_len] = w[i]; | |
593 | result_len++; | |
594 | } | |
595 | wsize = 0; | |
596 | break; | |
597 | ||
598 | default: | |
599 | BUG_ON(1); | |
600 | } | |
601 | } | |
f1932fc1 | 602 | return result_len; |
5ecc0a0f SW |
603 | } |
604 | ||
605 |