/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Intel Corporation All Rights Reserved
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifdef __KERNEL__
# include <linux/string.h>
# include <linux/slab.h>
# include <linux/bug.h>
# include <linux/kernel.h>
# include <linux/crush/crush.h>
# include <linux/crush/hash.h>
#else
# include "crush_compat.h"
# include "crush.h"
# include "hash.h"
#endif
#include "crush_ln_table.h"
#include "mapper.h"

#define dprintk(args...) /* printf(args) */

/*
 * Implement the core CRUSH mapping algorithm.
 */

/*
 * bucket choose methods
 *
 * For each bucket algorithm, we have a "choose" method that, given a
 * crush input @x and replica position (usually, position in output set) @r,
 * will produce an item in the bucket.
 */

/*
 * Choose based on a random permutation of the bucket.
 *
 * We used to use some prime number arithmetic to do this, but it
 * wasn't very random, and had some other bad behaviors.  Instead, we
 * calculate an actual random permutation of the bucket members.
 * Since this is expensive, we optimize for the r=0 case, which
 * captures the vast majority of calls.
 */
static int bucket_perm_choose(const struct crush_bucket *bucket,
			      struct crush_work_bucket *work,
			      int x, int r)
{
	unsigned int pr = r % bucket->size;
	unsigned int i, s;

	/* start a new permutation if @x has changed */
	if (work->perm_x != (__u32)x || work->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		work->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			work->perm[0] = s;
			work->perm_n = 0xffff;	/* magic value, see below */
			goto out;
		}

		for (i = 0; i < bucket->size; i++)
			work->perm[i] = i;
		work->perm_n = 0;
	} else if (work->perm_n == 0xffff) {
		/* clean up after the r=0 case above */
		for (i = 1; i < bucket->size; i++)
			work->perm[i] = i;
		work->perm[work->perm[0]] = 0;
		work->perm_n = 1;
	}

	/* calculate permutation up to pr */
	for (i = 0; i < work->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, work->perm[i]);
	while (work->perm_n <= pr) {
		unsigned int p = work->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				unsigned int t = work->perm[p + i];
				work->perm[p + i] = work->perm[p];
				work->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		work->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose %d: %d\n", i, work->perm[i]);

	s = work->perm[pr];
out:
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
	return bucket->items[s];
}
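
/*
 * Illustrative sketch (not part of CRUSH itself): the while loop above
 * is effectively a partial Fisher-Yates shuffle that only materializes
 * the permutation up to position pr.  A standalone version, with a
 * hypothetical prng() standing in for crush_hash32_3(), could look
 * like:
 *
 *	// extend perm[] (initially 0..n-1) so entries [0, pr] are final
 *	void extend_perm(unsigned *perm, unsigned n, unsigned *perm_n,
 *			 unsigned pr, unsigned (*prng)(unsigned))
 *	{
 *		while (*perm_n <= pr) {
 *			unsigned p = (*perm_n)++;
 *			if (p < n - 1) {
 *				unsigned i = prng(p) % (n - p);
 *				unsigned t = perm[p + i];
 *				perm[p + i] = perm[p];
 *				perm[p] = t;
 *			}
 *		}
 *	}
 *
 * Each position is swapped with a pseudo-randomly chosen later entry,
 * so entries at positions <= pr are distributed as in a full random
 * shuffle without paying for the whole permutation.
 */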

/* uniform */
static int bucket_uniform_choose(const struct crush_bucket_uniform *bucket,
				 struct crush_work_bucket *work, int x, int r)
{
	return bucket_perm_choose(&bucket->h, work, x, r);
}

/* list */
static int bucket_list_choose(const struct crush_bucket_list *bucket,
			      int x, int r)
{
	int i;

	for (i = bucket->h.size-1; i >= 0; i--) {
		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
					 r, bucket->h.id);
		w &= 0xffff;
		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
			"sw %x rand %llx",
			i, x, r, bucket->h.items[i], bucket->item_weights[i],
			bucket->sum_weights[i], w);
		w *= bucket->sum_weights[i];
		w = w >> 16;
		/*dprintk(" scaled %llx\n", w);*/
		if (w < bucket->item_weights[i]) {
			return bucket->h.items[i];
		}
	}

	dprintk("bad list sums for bucket %d\n", bucket->h.id);
	return bucket->h.items[0];
}
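
/*
 * Worked example (hypothetical numbers, not from a real map): suppose
 * item i has item_weights[i] = 0x20000 (2.0 in 16.16 fixed point) and
 * sum_weights[i] = 0x80000 (8.0, the combined weight of items 0..i).
 * The hash yields a uniform w in [0, 0xffff]; after
 *
 *	w = (w * sum_weights[i]) >> 16;
 *
 * w is roughly uniform in [0, sum_weights[i]), so the comparison
 * w < item_weights[i] accepts item i with probability about 2/8.
 * Walking the list from the tail therefore selects each item with
 * probability proportional to its weight.
 */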


/* (binary) tree */
static int height(int n)
{
	int h = 0;
	while ((n & 1) == 0) {
		h++;
		n = n >> 1;
	}
	return h;
}

static int left(int x)
{
	int h = height(x);
	return x - (1 << (h-1));
}

static int right(int x)
{
	int h = height(x);
	return x + (1 << (h-1));
}

static int terminal(int x)
{
	return x & 1;
}

static int bucket_tree_choose(const struct crush_bucket_tree *bucket,
			      int x, int r)
{
	int n;
	__u32 w;
	__u64 t;

	/* start at root */
	n = bucket->num_nodes >> 1;

	while (!terminal(n)) {
		int l;
		/* pick point in [0, w) */
		w = bucket->node_weights[n];
		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
					  bucket->h.id) * (__u64)w;
		t = t >> 32;

		/* descend to the left or right? */
		l = left(n);
		if (t < bucket->node_weights[l])
			n = l;
		else
			n = right(n);
	}

	return bucket->h.items[n >> 1];
}
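
/*
 * Node numbering sketch (follows from height()/left()/right() above):
 * the tree is stored implicitly in node_weights[], with leaves at odd
 * indices and interior nodes at even indices.  For num_nodes = 8 the
 * layout is
 *
 *	        4
 *	      /   \
 *	     2     6
 *	    / \   / \
 *	   1   3 5   7
 *
 * so the descent starts at the root num_nodes >> 1 = 4, with
 * height(4) = 2, left(4) = 2 and right(4) = 6.  terminal(n) is true
 * exactly for the odd (leaf) indices, and leaf n corresponds to
 * items[n >> 1].
 */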


/* straw */

static int bucket_straw_choose(const struct crush_bucket_straw *bucket,
			       int x, int r)
{
	__u32 i;
	int high = 0;
	__u64 high_draw = 0;
	__u64 draw;

	for (i = 0; i < bucket->h.size; i++) {
		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
		draw &= 0xffff;
		draw *= bucket->straws[i];
		if (i == 0 || draw > high_draw) {
			high = i;
			high_draw = draw;
		}
	}
	return bucket->h.items[high];
}

/* compute 2^44*log2(input+1) */
static __u64 crush_ln(unsigned int xin)
{
	unsigned int x = xin;
	int iexpon, index1, index2;
	__u64 RH, LH, LL, xl64, result;

	x++;

	/* normalize input */
	iexpon = 15;

	// figure out number of bits we need to shift and
	// do it in one step instead of iteratively
	if (!(x & 0x18000)) {
		int bits = __builtin_clz(x & 0x1FFFF) - 16;
		x <<= bits;
		iexpon = 15 - bits;
	}

	index1 = (x >> 8) << 1;
	/* RH ~ 2^56/index1 */
	RH = __RH_LH_tbl[index1 - 256];
	/* LH ~ 2^48 * log2(index1/256) */
	LH = __RH_LH_tbl[index1 + 1 - 256];

	/* RH*x ~ 2^48 * (2^15 + xf), xf<2^8 */
	xl64 = (__s64)x * RH;
	xl64 >>= 48;

	result = iexpon;
	result <<= (12 + 32);

	index2 = xl64 & 0xff;
	/* LL ~ 2^48*log2(1.0+index2/2^15) */
	LL = __LL_tbl[index2];

	LH = LH + LL;

	LH >>= (48 - 12 - 32);
	result += LH;

	return result;
}
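
/*
 * Sanity check (illustrative): for xin = 0xffff the function sees
 * x = 0x10000 = 2^16, so log2(x) = 16 and the result is close to
 * 16 * 2^44 = 0x1000000000000; for xin = 0 it sees x = 1 and returns
 * roughly 0.  The table lookups only approximate the fractional bits,
 * so callers should expect a small rounding error.
 */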


/*
 * straw2
 *
 * Suppose we have two osds, osd.0 and osd.1, with weights 8 and 4
 * respectively.  This means:
 * a) for osd.0, the time interval between io requests follows an
 *    exponential distribution with lambda equal to 8;
 * b) for osd.1, the time interval between io requests follows an
 *    exponential distribution with lambda equal to 4;
 * c) if we sample each osd's exponential random variable and keep the
 *    minimum, the total number of pgs placed on each osd ends up
 *    proportional to its weight.
 *
 * for reference, see:
 *
 * http://en.wikipedia.org/wiki/Exponential_distribution#Distribution_of_the_minimum_of_exponential_random_variables
 */
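
/*
 * Concrete example (using the hypothetical weights above): the minimum
 * of two independent exponentials with rates 8 and 4 comes from the
 * rate-8 variable with probability 8 / (8 + 4) = 2/3, so osd.0 receives
 * about two thirds of the placements and osd.1 about one third, i.e.
 * shares proportional to the weights 8 and 4.  The code below computes
 * draw = ln(u) / weight (a negative value) and keeps the *largest*
 * draw, which is the same as keeping the smallest -ln(u) / weight,
 * i.e. the minimum of the per-item exponential samples.
 */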

static inline __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
					    const struct crush_choose_arg *arg,
					    int position)
{
	if ((arg == NULL) || (arg->weight_set == NULL))
		return bucket->item_weights;
	if (position >= arg->weight_set_positions)
		position = arg->weight_set_positions - 1;
	return arg->weight_set[position].weights;
}

static inline __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
					const struct crush_choose_arg *arg)
{
	if ((arg == NULL) || (arg->ids == NULL))
		return bucket->h.items;
	return arg->ids;
}

/*
 * Compute exponential random variable using inversion method.
 *
 * for reference, see the exponential distribution example at:
 * https://en.wikipedia.org/wiki/Inverse_transform_sampling#Examples
 */
static inline __s64 generate_exponential_distribution(int type, int x, int y, int z,
						      int weight)
{
	unsigned int u = crush_hash32_3(type, x, y, z);
	u &= 0xffff;

	/*
	 * for some reason slightly less than 0x10000 produces
	 * a slightly more accurate distribution... probably a
	 * rounding effect.
	 *
	 * the natural log lookup table maps [0,0xffff]
	 * (corresponding to real numbers [1/0x10000, 1]) to
	 * [0, 0xffffffffffff] (corresponding to real numbers
	 * [-11.090355,0]).
	 */
	__s64 ln = crush_ln(u) - 0x1000000000000ll;

	/*
	 * divide by 16.16 fixed-point weight.  note
	 * that the ln value is negative, so a larger
	 * weight means a larger (less negative) value
	 * for draw.
	 */
	return div64_s64(ln, weight);
}
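
/*
 * Background note (standard inversion method, not specific to this
 * file): if U is uniform on (0, 1], then -ln(U) / lambda is
 * exponentially distributed with rate lambda.  The "ln" used here is
 * really a base-2 log in 2^44 fixed point (crush_ln() above), i.e. a
 * constant multiple of the natural log; since the same constant
 * applies to every item in the bucket, the ranking of the draws is
 * unaffected.
 */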

static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
				int x, int r, const struct crush_choose_arg *arg,
				int position)
{
	unsigned int i, high = 0;
	__s64 draw, high_draw = 0;
	__u32 *weights = get_choose_arg_weights(bucket, arg, position);
	__s32 *ids = get_choose_arg_ids(bucket, arg);
	for (i = 0; i < bucket->h.size; i++) {
		dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
		if (weights[i]) {
			draw = generate_exponential_distribution(bucket->h.hash, x, ids[i], r, weights[i]);
		} else {
			draw = S64_MIN;
		}

		if (i == 0 || draw > high_draw) {
			high = i;
			high_draw = draw;
		}
	}

	return bucket->h.items[high];
}


static int crush_bucket_choose(const struct crush_bucket *in,
			       struct crush_work_bucket *work,
			       int x, int r,
			       const struct crush_choose_arg *arg,
			       int position)
{
	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
	BUG_ON(in->size == 0);
	switch (in->alg) {
	case CRUSH_BUCKET_UNIFORM:
		return bucket_uniform_choose(
			(const struct crush_bucket_uniform *)in,
			work, x, r);
	case CRUSH_BUCKET_LIST:
		return bucket_list_choose((const struct crush_bucket_list *)in,
					  x, r);
	case CRUSH_BUCKET_TREE:
		return bucket_tree_choose((const struct crush_bucket_tree *)in,
					  x, r);
	case CRUSH_BUCKET_STRAW:
		return bucket_straw_choose(
			(const struct crush_bucket_straw *)in,
			x, r);
	case CRUSH_BUCKET_STRAW2:
		return bucket_straw2_choose(
			(const struct crush_bucket_straw2 *)in,
			x, r, arg, position);
	default:
		dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
		return in->items[0];
	}
}

/*
 * true if device is marked "out" of the cluster (failed, fully
 * offloaded)
 */
static int is_out(const struct crush_map *map,
		  const __u32 *weight, int weight_max,
		  int item, int x)
{
	if (item >= weight_max)
		return 1;
	if (weight[item] >= 0x10000)
		return 0;
	if (weight[item] == 0)
		return 1;
	if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
	    < weight[item])
		return 0;
	return 1;
}
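
/*
 * Example of the fractional case (hypothetical weight): an item with
 * weight[item] = 0x8000 (0.5 in 16.16 fixed point) is treated as "in"
 * for roughly half of all inputs x, because the 16-bit hash value is
 * uniform over [0, 0xffff] and compared against 0x8000.  Weights of
 * 0x10000 (1.0) and 0 short-circuit to always-in and always-out.
 */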

/**
 * crush_choose_firstn - choose numrep distinct items of given type
 * @map: the crush_map
 * @bucket: the bucket we are choosing an item from
 * @x: crush input value
 * @numrep: the number of items to choose
 * @type: the type of item to choose
 * @out: pointer to output vector
 * @outpos: our position in that vector
 * @out_size: size of the out vector
 * @tries: number of attempts to make
 * @recurse_tries: number of attempts to have recursive chooseleaf make
 * @local_retries: localized retries
 * @local_fallback_retries: localized fallback retries
 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
 * @stable: stable mode starts rep=0 in the recursive call for all replicas
 * @vary_r: pass r to recursive calls
 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 * @parent_r: r value passed from the parent
 */
static int crush_choose_firstn(const struct crush_map *map,
			       struct crush_work *work,
			       const struct crush_bucket *bucket,
			       const __u32 *weight, int weight_max,
			       int x, int numrep, int type,
			       int *out, int outpos,
			       int out_size,
			       unsigned int tries,
			       unsigned int recurse_tries,
			       unsigned int local_retries,
			       unsigned int local_fallback_retries,
			       int recurse_to_leaf,
			       unsigned int vary_r,
			       unsigned int stable,
			       int *out2,
			       int parent_r,
			       const struct crush_choose_arg *choose_args)
{
	int rep;
	unsigned int ftotal, flocal;
	int retry_descent, retry_bucket, skip_rep;
	const struct crush_bucket *in = bucket;
	int r;
	int i;
	int item = 0;
	int itemtype;
	int collide, reject;
	int count = out_size;

	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d \
recurse_tries %d local_retries %d local_fallback_retries %d \
parent_r %d stable %d\n",
		recurse_to_leaf ? "_LEAF" : "",
		bucket->id, x, outpos, numrep,
		tries, recurse_tries, local_retries, local_fallback_retries,
		parent_r, stable);

	for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
		/* keep trying until we get a non-out, non-colliding item */
		ftotal = 0;
		skip_rep = 0;
		do {
			retry_descent = 0;
			in = bucket;	/* initial bucket */

			/* choose through intervening buckets */
			flocal = 0;
			do {
				collide = 0;
				retry_bucket = 0;
				r = rep + parent_r;
				/* r' = r + f_total */
				r += ftotal;

				/* bucket choose */
				if (in->size == 0) {
					reject = 1;
					goto reject;
				}
				if (local_fallback_retries > 0 &&
				    flocal >= (in->size>>1) &&
				    flocal > local_fallback_retries)
					item = bucket_perm_choose(
						in, work->work[-1-in->id],
						x, r);
				else
					item = crush_bucket_choose(
						in, work->work[-1-in->id],
						x, r,
						(choose_args ? &choose_args[-1-in->id] : 0),
						outpos);
				if (item >= map->max_devices) {
					dprintk(" bad item %d\n", item);
					skip_rep = 1;
					break;
				}

				/* desired type? */
				if (item < 0)
					itemtype = map->buckets[-1-item]->type;
				else
					itemtype = 0;
				dprintk(" item %d type %d\n", item, itemtype);

				/* keep going? */
				if (itemtype != type) {
					if (item >= 0 ||
					    (-1-item) >= map->max_buckets) {
						dprintk(" bad item type %d\n", type);
						skip_rep = 1;
						break;
					}
					in = map->buckets[-1-item];
					retry_bucket = 1;
					continue;
				}

				/* collision? */
				for (i = 0; i < outpos; i++) {
					if (out[i] == item) {
						collide = 1;
						break;
					}
				}

				reject = 0;
				if (!collide && recurse_to_leaf) {
					if (item < 0) {
						int sub_r;
						if (vary_r)
							sub_r = r >> (vary_r-1);
						else
							sub_r = 0;
						if (crush_choose_firstn(
							    map,
							    work,
							    map->buckets[-1-item],
							    weight, weight_max,
							    x, stable ? 1 : outpos+1, 0,
							    out2, outpos, count,
							    recurse_tries, 0,
							    local_retries,
							    local_fallback_retries,
							    0,
							    vary_r,
							    stable,
							    NULL,
							    sub_r,
							    choose_args) <= outpos)
							/* didn't get leaf */
							reject = 1;
					} else {
						/* we already have a leaf! */
						out2[outpos] = item;
					}
				}

				if (!reject && !collide) {
					/* out? */
					if (itemtype == 0)
						reject = is_out(map, weight,
								weight_max,
								item, x);
				}

reject:
				if (reject || collide) {
					ftotal++;
					flocal++;

					if (collide && flocal <= local_retries)
						/* retry locally a few times */
						retry_bucket = 1;
					else if (local_fallback_retries > 0 &&
						 flocal <= in->size + local_fallback_retries)
						/* exhaustive bucket search */
						retry_bucket = 1;
					else if (ftotal < tries)
						/* then retry descent */
						retry_descent = 1;
					else
						/* else give up */
						skip_rep = 1;
					dprintk(" reject %d collide %d "
						"ftotal %u flocal %u\n",
						reject, collide, ftotal,
						flocal);
				}
			} while (retry_bucket);
		} while (retry_descent);

		if (skip_rep) {
			dprintk("skip rep\n");
			continue;
		}

		dprintk("CHOOSE got %d\n", item);
		out[outpos] = item;
		outpos++;
		count--;
#ifndef __KERNEL__
		if (map->choose_tries && ftotal <= map->choose_total_tries)
			map->choose_tries[ftotal]++;
#endif
	}

	dprintk("CHOOSE returns %d\n", outpos);
	return outpos;
}
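
/*
 * Retry arithmetic example (illustrative): for replica rep = 2 with
 * parent_r = 0, the first descent uses r = 2.  Every rejection or
 * collision increments ftotal, so later descents use r = 3, 4, ...
 * until an acceptable item is found or ftotal reaches the try budget,
 * at which point the replica is skipped.
 */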


/**
 * crush_choose_indep: alternative breadth-first positionally stable mapping
 *
 */
static void crush_choose_indep(const struct crush_map *map,
			       struct crush_work *work,
			       const struct crush_bucket *bucket,
			       const __u32 *weight, int weight_max,
			       int x, int left, int numrep, int type,
			       int *out, int outpos,
			       unsigned int tries,
			       unsigned int recurse_tries,
			       int recurse_to_leaf,
			       int *out2,
			       int parent_r,
			       const struct crush_choose_arg *choose_args)
{
	const struct crush_bucket *in = bucket;
	int endpos = outpos + left;
	int rep;
	unsigned int ftotal;
	int r;
	int i;
	int item = 0;
	int itemtype;
	int collide;

	dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
		bucket->id, x, outpos, numrep);

	/* initially my result is undefined */
	for (rep = outpos; rep < endpos; rep++) {
		out[rep] = CRUSH_ITEM_UNDEF;
		if (out2)
			out2[rep] = CRUSH_ITEM_UNDEF;
	}

	for (ftotal = 0; left > 0 && ftotal < tries; ftotal++) {
#ifdef DEBUG_INDEP
		if (out2 && ftotal) {
			dprintk("%u %d a: ", ftotal, left);
			for (rep = outpos; rep < endpos; rep++) {
				dprintk(" %d", out[rep]);
			}
			dprintk("\n");
			dprintk("%u %d b: ", ftotal, left);
			for (rep = outpos; rep < endpos; rep++) {
				dprintk(" %d", out2[rep]);
			}
			dprintk("\n");
		}
#endif
		for (rep = outpos; rep < endpos; rep++) {
			if (out[rep] != CRUSH_ITEM_UNDEF)
				continue;

			in = bucket;	/* initial bucket */

			/* choose through intervening buckets */
			for (;;) {
				/* note: we base the choice on the position
				 * even in the nested call.  that means that
				 * if the first layer chooses the same bucket
				 * in a different position, we will tend to
				 * choose a different item in that bucket.
				 * this will involve more devices in data
				 * movement and tend to distribute the load.
				 */
				r = rep + parent_r;

				/* be careful */
				if (in->alg == CRUSH_BUCKET_UNIFORM &&
				    in->size % numrep == 0)
					/* r'=r+(n+1)*f_total */
					r += (numrep+1) * ftotal;
				else
					/* r' = r + n*f_total */
					r += numrep * ftotal;

				/* bucket choose */
				if (in->size == 0) {
					dprintk(" empty bucket\n");
					break;
				}

				item = crush_bucket_choose(
					in, work->work[-1-in->id],
					x, r,
					(choose_args ? &choose_args[-1-in->id] : 0),
					outpos);
				if (item >= map->max_devices) {
					dprintk(" bad item %d\n", item);
					out[rep] = CRUSH_ITEM_NONE;
					if (out2)
						out2[rep] = CRUSH_ITEM_NONE;
					left--;
					break;
				}

				/* desired type? */
				if (item < 0)
					itemtype = map->buckets[-1-item]->type;
				else
					itemtype = 0;
				dprintk(" item %d type %d\n", item, itemtype);

				/* keep going? */
				if (itemtype != type) {
					if (item >= 0 ||
					    (-1-item) >= map->max_buckets) {
						dprintk(" bad item type %d\n", type);
						out[rep] = CRUSH_ITEM_NONE;
						if (out2)
							out2[rep] =
								CRUSH_ITEM_NONE;
						left--;
						break;
					}
					in = map->buckets[-1-item];
					continue;
				}

				/* collision? */
				collide = 0;
				for (i = outpos; i < endpos; i++) {
					if (out[i] == item) {
						collide = 1;
						break;
					}
				}
				if (collide)
					break;

				if (recurse_to_leaf) {
					if (item < 0) {
						crush_choose_indep(
							map,
							work,
							map->buckets[-1-item],
							weight, weight_max,
							x, 1, numrep, 0,
							out2, rep,
							recurse_tries, 0,
							0, NULL, r, choose_args);
						if (out2 && out2[rep] == CRUSH_ITEM_NONE) {
							/* placed nothing; no leaf */
							break;
						}
					} else if (out2) {
						/* we already have a leaf! */
						out2[rep] = item;
					}
				}

				/* out? */
				if (itemtype == 0 &&
				    is_out(map, weight, weight_max, item, x))
					break;

				/* yay! */
				out[rep] = item;
				left--;
				break;
			}
		}
	}
	for (rep = outpos; rep < endpos; rep++) {
		if (out[rep] == CRUSH_ITEM_UNDEF) {
			out[rep] = CRUSH_ITEM_NONE;
		}
		if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
			out2[rep] = CRUSH_ITEM_NONE;
		}
	}
#ifndef __KERNEL__
	if (map->choose_tries && ftotal <= map->choose_total_tries)
		map->choose_tries[ftotal]++;
#endif
#ifdef DEBUG_INDEP
	if (out2) {
		dprintk("%u %d a: ", ftotal, left);
		for (rep = outpos; rep < endpos; rep++) {
			dprintk(" %d", out[rep]);
		}
		dprintk("\n");
		dprintk("%u %d b: ", ftotal, left);
		for (rep = outpos; rep < endpos; rep++) {
			dprintk(" %d", out2[rep]);
		}
		dprintk("\n");
	}
#endif
}
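
/*
 * Retry arithmetic example (illustrative, non-uniform branch): with
 * numrep = 3 and parent_r = 0, position rep = 1 first tries r = 1; if
 * that pass fails, the next pass uses r = 1 + 3*1 = 4, then
 * r = 1 + 3*2 = 7, and so on.  Each position derives its r values only
 * from its own rep, so a retry at one position does not change the
 * items already placed at the others, which is what keeps this variant
 * positionally stable.
 */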


/* This takes a chunk of memory and sets it up to be a shiny new
   working area for a CRUSH placement computation. It must be called
   on any newly allocated memory before passing it in to
   crush_do_rule. It may be used repeatedly after that, so long as the
   map has not changed. If the map /has/ changed, you must make sure
   the working size is no smaller than what was allocated and re-run
   crush_init_workspace.

   If you do retain the working space between calls to crush, make it
   thread-local. If you reinstitute the locking I've spent so much
   time getting rid of, I will be very unhappy with you. */

void crush_init_workspace(const struct crush_map *m, void *v) {
	/* We work by moving through the available space and setting
	   values and pointers as we go.

	   It's a bit like Forth's use of the 'allot' word since we
	   set the pointer first and then reserve the space for it to
	   point to by incrementing the point. */
	struct crush_work *w = (struct crush_work *)v;
	char *point = (char *)v;
	__s32 b;
	point += sizeof(struct crush_work);
	w->work = (struct crush_work_bucket **)point;
	point += m->max_buckets * sizeof(struct crush_work_bucket *);
	for (b = 0; b < m->max_buckets; ++b) {
		if (m->buckets[b] == 0)
			continue;

		w->work[b] = (struct crush_work_bucket *) point;
		switch (m->buckets[b]->alg) {
		default:
			point += sizeof(struct crush_work_bucket);
			break;
		}
		w->work[b]->perm_x = 0;
		w->work[b]->perm_n = 0;
		w->work[b]->perm = (__u32 *)point;
		point += m->buckets[b]->size * sizeof(__u32);
	}
	BUG_ON((char *)point - (char *)w != m->working_size);
}
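
/*
 * Layout sketch of the initialized workspace (derived from the code
 * above; actual sizes depend on the map):
 *
 *	struct crush_work                        <- w (start of buffer)
 *	struct crush_work_bucket *[max_buckets]  <- w->work
 *	for each non-NULL bucket b:
 *		struct crush_work_bucket         <- w->work[b]
 *		__u32 perm[buckets[b]->size]     <- w->work[b]->perm
 *
 * The running total must equal m->working_size, which is exactly what
 * the final BUG_ON() asserts.
 */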

/**
 * crush_do_rule - calculate a mapping with the given input and rule
 * @map: the crush_map
 * @ruleno: the rule id
 * @x: hash input
 * @result: pointer to result vector
 * @result_max: maximum result size
 * @weight: weight vector (for map leaves)
 * @weight_max: size of weight vector
 * @cwin: pointer to at least map->working_size bytes of memory or NULL
 * @choose_args: per-bucket weight/id overrides (as consumed by the straw2
 *               choose helpers above), or NULL to use the map's own values
 */
int crush_do_rule(const struct crush_map *map,
		  int ruleno, int x, int *result, int result_max,
		  const __u32 *weight, int weight_max,
		  void *cwin, const struct crush_choose_arg *choose_args)
{
	int result_len;
	struct crush_work *cw = cwin;
	int *a = (int *)((char *)cw + map->working_size);
	int *b = a + result_max;
	int *c = b + result_max;
	int *w = a;
	int *o = b;
	int recurse_to_leaf;
	int wsize = 0;
	int osize;
	int *tmp;
	const struct crush_rule *rule;
	__u32 step;
	int i, j;
	int numrep;
	int out_size;
	/*
	 * the original choose_total_tries value was off by one (it
	 * counted "retries" and not "tries").  add one.
	 */
	int choose_tries = map->choose_total_tries + 1;
	int choose_leaf_tries = 0;
	/*
	 * the local tries values were counted as "retries", though,
	 * and need no adjustment
	 */
	int choose_local_retries = map->choose_local_tries;
	int choose_local_fallback_retries = map->choose_local_fallback_tries;

	int vary_r = map->chooseleaf_vary_r;
	int stable = map->chooseleaf_stable;

	if ((__u32)ruleno >= map->max_rules) {
		dprintk(" bad ruleno %d\n", ruleno);
		return 0;
	}

	rule = map->rules[ruleno];
	result_len = 0;

	for (step = 0; step < rule->len; step++) {
		int firstn = 0;
		const struct crush_rule_step *curstep = &rule->steps[step];

		switch (curstep->op) {
		case CRUSH_RULE_TAKE:
			if ((curstep->arg1 >= 0 &&
			     curstep->arg1 < map->max_devices) ||
			    (-1-curstep->arg1 >= 0 &&
			     -1-curstep->arg1 < map->max_buckets &&
			     map->buckets[-1-curstep->arg1])) {
				w[0] = curstep->arg1;
				wsize = 1;
			} else {
				dprintk(" bad take value %d\n", curstep->arg1);
			}
			break;

		case CRUSH_RULE_SET_CHOOSE_TRIES:
			if (curstep->arg1 > 0)
				choose_tries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSELEAF_TRIES:
			if (curstep->arg1 > 0)
				choose_leaf_tries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
			if (curstep->arg1 >= 0)
				choose_local_retries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
			if (curstep->arg1 >= 0)
				choose_local_fallback_retries = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
			if (curstep->arg1 >= 0)
				vary_r = curstep->arg1;
			break;

		case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
			if (curstep->arg1 >= 0)
				stable = curstep->arg1;
			break;

		case CRUSH_RULE_CHOOSELEAF_FIRSTN:
		case CRUSH_RULE_CHOOSE_FIRSTN:
			firstn = 1;
			/* fall through */
		case CRUSH_RULE_CHOOSELEAF_INDEP:
		case CRUSH_RULE_CHOOSE_INDEP:
			if (wsize == 0)
				break;

			recurse_to_leaf =
				curstep->op ==
				CRUSH_RULE_CHOOSELEAF_FIRSTN ||
				curstep->op ==
				CRUSH_RULE_CHOOSELEAF_INDEP;

			/* reset output */
			osize = 0;

			for (i = 0; i < wsize; i++) {
				int bno;
				numrep = curstep->arg1;
				if (numrep <= 0) {
					numrep += result_max;
					if (numrep <= 0)
						continue;
				}
				j = 0;
				/* make sure bucket id is valid */
				bno = -1 - w[i];
				if (bno < 0 || bno >= map->max_buckets) {
					// w[i] is probably CRUSH_ITEM_NONE
					dprintk(" bad w[i] %d\n", w[i]);
					continue;
				}
				if (firstn) {
					int recurse_tries;
					if (choose_leaf_tries)
						recurse_tries =
							choose_leaf_tries;
					else if (map->chooseleaf_descend_once)
						recurse_tries = 1;
					else
						recurse_tries = choose_tries;
					osize += crush_choose_firstn(
						map,
						cw,
						map->buckets[bno],
						weight, weight_max,
						x, numrep,
						curstep->arg2,
						o+osize, j,
						result_max-osize,
						choose_tries,
						recurse_tries,
						choose_local_retries,
						choose_local_fallback_retries,
						recurse_to_leaf,
						vary_r,
						stable,
						c+osize,
						0,
						choose_args);
				} else {
					out_size = ((numrep < (result_max-osize)) ?
						    numrep : (result_max-osize));
					crush_choose_indep(
						map,
						cw,
						map->buckets[bno],
						weight, weight_max,
						x, out_size, numrep,
						curstep->arg2,
						o+osize, j,
						choose_tries,
						choose_leaf_tries ?
						choose_leaf_tries : 1,
						recurse_to_leaf,
						c+osize,
						0,
						choose_args);
					osize += out_size;
				}
			}

			if (recurse_to_leaf)
				/* copy final _leaf_ values to output set */
				memcpy(o, c, osize*sizeof(*o));

			/* swap o and w arrays */
			tmp = o;
			o = w;
			w = tmp;
			wsize = osize;
			break;


		case CRUSH_RULE_EMIT:
			for (i = 0; i < wsize && result_len < result_max; i++) {
				result[result_len] = w[i];
				result_len++;
			}
			wsize = 0;
			break;

		default:
			dprintk(" unknown op %d at step %d\n",
				curstep->op, step);
			break;
		}
	}

	return result_len;
}
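
/*
 * Usage sketch (illustrative only, error handling elided): a caller
 * with a decoded struct crush_map would typically allocate one scratch
 * buffer of map->working_size bytes plus room for the three
 * result_max-sized int arrays that crush_do_rule() places right after
 * it, initialize the workspace once, and then map inputs:
 *
 *	size_t sz = map->working_size + 3 * result_max * sizeof(int);
 *	void *work = malloc(sz);
 *	int *result = malloc(result_max * sizeof(int));
 *
 *	crush_init_workspace(map, work);
 *	for (x = 0; x < num_inputs; x++) {
 *		int n = crush_do_rule(map, ruleno, x, result, result_max,
 *				      weight, weight_max, work, NULL);
 *		// result[0..n-1] now holds the chosen items
 *	}
 *	free(result);
 *	free(work);
 */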