1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "include/stringify.h"
5 #include "CrushTester.h"
6 #include "CrushTreeDumper.h"
7 #include "include/ceph_features.h"
11 #include <boost/lexical_cast.hpp>
12 // to workaround https://svn.boost.org/trac/boost/ticket/9501
13 #ifdef _LIBCPP_VERSION
14 #include <boost/version.hpp>
15 #if BOOST_VERSION < 105600
16 #define ICL_USE_BOOST_MOVE_IMPLEMENTATION
19 #include <boost/icl/interval_map.hpp>
20 #include <boost/algorithm/string/join.hpp>
21 #include "common/SubProcess.h"
22 #include "common/fork_function.h"
24 void CrushTester::set_device_weight(int dev
, float f
)
26 int w
= (int)(f
* 0x10000);
31 device_weight
[dev
] = w
;
34 int CrushTester::get_maximum_affected_by_rule(int ruleno
)
36 // get the number of steps in RULENO
37 int rule_size
= crush
.get_rule_len(ruleno
);
38 vector
<int> affected_types
;
39 map
<int,int> replications_by_type
;
41 for (int i
= 0; i
< rule_size
; i
++){
42 // get what operation is done by the current step
43 int rule_operation
= crush
.get_rule_op(ruleno
, i
);
45 // if the operation specifies choosing a device type, store it
46 if (rule_operation
>= 2 && rule_operation
!= 4){
47 int desired_replication
= crush
.get_rule_arg1(ruleno
,i
);
48 int affected_type
= crush
.get_rule_arg2(ruleno
,i
);
49 affected_types
.push_back(affected_type
);
50 replications_by_type
[affected_type
] = desired_replication
;
55 * now for each of the affected bucket types, see what is the
56 * maximum we are (a) requesting or (b) have
59 map
<int,int> max_devices_of_type
;
61 // loop through the vector of affected types
62 for (vector
<int>::iterator it
= affected_types
.begin(); it
!= affected_types
.end(); ++it
){
63 // loop through the number of buckets looking for affected types
64 for (map
<int,string
>::iterator p
= crush
.name_map
.begin(); p
!= crush
.name_map
.end(); ++p
){
65 int bucket_type
= crush
.get_bucket_type(p
->first
);
66 if ( bucket_type
== *it
)
67 max_devices_of_type
[*it
]++;
71 for(std::vector
<int>::iterator it
= affected_types
.begin(); it
!= affected_types
.end(); ++it
){
72 if ( replications_by_type
[*it
] > 0 && replications_by_type
[*it
] < max_devices_of_type
[*it
] )
73 max_devices_of_type
[*it
] = replications_by_type
[*it
];
77 * get the smallest number of buckets available of any type as this is our upper bound on
78 * the number of replicas we can place
80 int max_affected
= max( crush
.get_max_buckets(), crush
.get_max_devices() );
82 for(std::vector
<int>::iterator it
= affected_types
.begin(); it
!= affected_types
.end(); ++it
){
83 if (max_devices_of_type
[*it
] > 0 && max_devices_of_type
[*it
] < max_affected
)
84 max_affected
= max_devices_of_type
[*it
];
91 map
<int,int> CrushTester::get_collapsed_mapping()
93 int num_to_check
= crush
.get_max_devices();
95 map
<int, int> collapse_mask
;
97 for (int i
= 0; i
< num_to_check
; i
++){
98 if (crush
.check_item_present(i
)){
99 collapse_mask
[i
] = next_id
;
104 return collapse_mask
;
107 void CrushTester::adjust_weights(vector
<__u32
>& weight
)
110 if (mark_down_device_ratio
> 0) {
112 vector
<int> bucket_ids
;
113 for (int i
= 0; i
< crush
.get_max_buckets(); i
++) {
115 if (crush
.get_bucket_weight(id
) > 0) {
116 bucket_ids
.push_back(id
);
120 // get buckets that are one level above a device
121 vector
<int> buckets_above_devices
;
122 for (unsigned i
= 0; i
< bucket_ids
.size(); i
++) {
123 // grab the first child object of a bucket and check if it's ID is less than 0
124 int id
= bucket_ids
[i
];
125 if (crush
.get_bucket_size(id
) == 0)
127 int first_child
= crush
.get_bucket_item(id
, 0); // returns the ID of the bucket or device
128 if (first_child
>= 0) {
129 buckets_above_devices
.push_back(id
);
133 // permute bucket list
134 for (unsigned i
= 0; i
< buckets_above_devices
.size(); i
++) {
135 unsigned j
= lrand48() % (buckets_above_devices
.size() - 1);
136 std::swap(buckets_above_devices
[i
], buckets_above_devices
[j
]);
139 // calculate how many buckets and devices we need to reap...
140 int num_buckets_to_visit
= (int) (mark_down_bucket_ratio
* buckets_above_devices
.size());
142 for (int i
= 0; i
< num_buckets_to_visit
; i
++) {
143 int id
= buckets_above_devices
[i
];
144 int size
= crush
.get_bucket_size(id
);
146 for (int o
= 0; o
< size
; o
++)
147 items
.push_back(crush
.get_bucket_item(id
, o
));
150 for (int o
= 0; o
< size
; o
++) {
151 int j
= lrand48() % (crush
.get_bucket_size(id
) - 1);
152 std::swap(items
[o
], items
[j
]);
155 int local_devices_to_visit
= (int) (mark_down_device_ratio
*size
);
156 for (int o
= 0; o
< local_devices_to_visit
; o
++){
157 int item
= crush
.get_bucket_item(id
, o
);
164 bool CrushTester::check_valid_placement(int ruleno
, vector
<int> in
, const vector
<__u32
>& weight
)
167 bool valid_placement
= true;
168 vector
<int> included_devices
;
169 map
<string
,string
> seen_devices
;
171 // first do the easy check that all devices are "up"
172 for (vector
<int>::iterator it
= in
.begin(); it
!= in
.end(); ++it
) {
173 if (weight
[(*it
)] == 0) {
174 valid_placement
= false;
176 } else if (weight
[(*it
)] > 0) {
177 included_devices
.push_back( (*it
) );
182 * now do the harder test of checking that the CRUSH rule r is not violated
183 * we could test that none of the devices mentioned in out are unique,
184 * but this is a special case of this test
187 // get the number of steps in RULENO
188 int rule_size
= crush
.get_rule_len(ruleno
);
189 vector
<string
> affected_types
;
191 // get the smallest type id, and name
192 int min_map_type
= crush
.get_num_type_names();
193 for (map
<int,string
>::iterator it
= crush
.type_map
.begin(); it
!= crush
.type_map
.end(); ++it
) {
194 if ( (*it
).first
< min_map_type
) {
195 min_map_type
= (*it
).first
;
199 string min_map_type_name
= crush
.type_map
[min_map_type
];
201 // get the types of devices affected by RULENO
202 for (int i
= 0; i
< rule_size
; i
++) {
203 // get what operation is done by the current step
204 int rule_operation
= crush
.get_rule_op(ruleno
, i
);
206 // if the operation specifies choosing a device type, store it
207 if (rule_operation
>= 2 && rule_operation
!= 4) {
208 int affected_type
= crush
.get_rule_arg2(ruleno
,i
);
209 affected_types
.push_back( crush
.get_type_name(affected_type
));
213 // find in if we are only dealing with osd's
214 bool only_osd_affected
= false;
215 if (affected_types
.size() == 1) {
216 if ((affected_types
.back() == min_map_type_name
) && (min_map_type_name
== "osd")) {
217 only_osd_affected
= true;
221 // check that we don't have any duplicate id's
222 for (vector
<int>::iterator it
= included_devices
.begin(); it
!= included_devices
.end(); ++it
) {
223 int num_copies
= std::count(included_devices
.begin(), included_devices
.end(), (*it
) );
224 if (num_copies
> 1) {
225 valid_placement
= false;
229 // if we have more than just osd's affected we need to do a lot more work
230 if (!only_osd_affected
) {
231 // loop through the devices that are "in/up"
232 for (vector
<int>::iterator it
= included_devices
.begin(); it
!= included_devices
.end(); ++it
) {
233 if (valid_placement
== false)
236 // create a temporary map of the form (device type, device name in map)
237 map
<string
,string
> device_location_hierarchy
= crush
.get_full_location(*it
);
239 // loop over the types affected by RULENO looking for duplicate bucket assignments
240 for (vector
<string
>::iterator t
= affected_types
.begin(); t
!= affected_types
.end(); ++t
) {
241 if (seen_devices
.count( device_location_hierarchy
[*t
])) {
242 valid_placement
= false;
245 // store the devices we have seen in the form of (device name, device type)
246 seen_devices
[ device_location_hierarchy
[*t
] ] = *t
;
252 return valid_placement
;
255 int CrushTester::random_placement(int ruleno
, vector
<int>& out
, int maxout
, vector
<__u32
>& weight
)
257 // get the total weight of the system
258 int total_weight
= 0;
259 for (unsigned i
= 0; i
< weight
.size(); i
++)
260 total_weight
+= weight
[i
];
262 if (total_weight
== 0 ||
263 crush
.get_max_devices() == 0)
266 // determine the real maximum number of devices to return
267 int devices_requested
= min(maxout
, get_maximum_affected_by_rule(ruleno
));
268 bool accept_placement
= false;
270 vector
<int> trial_placement(devices_requested
);
271 int attempted_tries
= 0;
274 // create a vector to hold our trial mappings
275 int temp_array
[devices_requested
];
276 for (int i
= 0; i
< devices_requested
; i
++){
277 temp_array
[i
] = lrand48() % (crush
.get_max_devices());
280 trial_placement
.assign(temp_array
, temp_array
+ devices_requested
);
281 accept_placement
= check_valid_placement(ruleno
, trial_placement
, weight
);
283 } while (accept_placement
== false && attempted_tries
< max_tries
);
285 // save our random placement to the out vector
286 if (accept_placement
)
287 out
.assign(trial_placement
.begin(), trial_placement
.end());
290 else if (attempted_tries
== max_tries
)
296 void CrushTester::write_integer_indexed_vector_data_string(vector
<string
> &dst
, int index
, vector
<int> vector_data
)
298 stringstream
data_buffer (stringstream::in
| stringstream::out
);
299 unsigned input_size
= vector_data
.size();
301 // pass the indexing variable to the data buffer
302 data_buffer
<< index
;
304 // pass the rest of the input data to the buffer
305 for (unsigned i
= 0; i
< input_size
; i
++) {
306 data_buffer
<< ',' << vector_data
[i
];
309 data_buffer
<< std::endl
;
311 // write the data buffer to the destination
312 dst
.push_back( data_buffer
.str() );
315 void CrushTester::write_integer_indexed_vector_data_string(vector
<string
> &dst
, int index
, vector
<float> vector_data
)
317 stringstream
data_buffer (stringstream::in
| stringstream::out
);
318 unsigned input_size
= vector_data
.size();
320 // pass the indexing variable to the data buffer
321 data_buffer
<< index
;
323 // pass the rest of the input data to the buffer
324 for (unsigned i
= 0; i
< input_size
; i
++) {
325 data_buffer
<< ',' << vector_data
[i
];
328 data_buffer
<< std::endl
;
330 // write the data buffer to the destination
331 dst
.push_back( data_buffer
.str() );
334 void CrushTester::write_integer_indexed_scalar_data_string(vector
<string
> &dst
, int index
, int scalar_data
)
336 stringstream
data_buffer (stringstream::in
| stringstream::out
);
338 // pass the indexing variable to the data buffer
339 data_buffer
<< index
;
341 // pass the input data to the buffer
342 data_buffer
<< ',' << scalar_data
;
343 data_buffer
<< std::endl
;
345 // write the data buffer to the destination
346 dst
.push_back( data_buffer
.str() );
348 void CrushTester::write_integer_indexed_scalar_data_string(vector
<string
> &dst
, int index
, float scalar_data
)
350 stringstream
data_buffer (stringstream::in
| stringstream::out
);
352 // pass the indexing variable to the data buffer
353 data_buffer
<< index
;
355 // pass the input data to the buffer
356 data_buffer
<< ',' << scalar_data
;
357 data_buffer
<< std::endl
;
359 // write the data buffer to the destination
360 dst
.push_back( data_buffer
.str() );
363 int CrushTester::test_with_fork(int timeout
)
366 int r
= fork_function(timeout
, sink
, [&]() {
369 if (r
== -ETIMEDOUT
) {
370 err
<< "timed out during smoke test (" << timeout
<< " seconds)";
376 class BadCrushMap
: public std::runtime_error
{
379 BadCrushMap(const char* msg
, int id
)
380 : std::runtime_error(msg
), item(id
) {}
382 // throws if any node in the crush fail to print
383 class CrushWalker
: public CrushTreeDumper::Dumper
<void> {
384 typedef void DumbFormatter
;
385 typedef CrushTreeDumper::Dumper
<DumbFormatter
> Parent
;
388 CrushWalker(const CrushWrapper
*crush
, unsigned max_id
)
389 : Parent(crush
), max_id(max_id
) {}
390 void dump_item(const CrushTreeDumper::Item
&qi
, DumbFormatter
*) override
{
392 if (qi
.is_bucket()) {
393 if (!crush
->get_item_name(qi
.id
)) {
394 throw BadCrushMap("unknown item name", qi
.id
);
396 type
= crush
->get_bucket_type(qi
.id
);
398 if (max_id
> 0 && qi
.id
>= max_id
) {
399 throw BadCrushMap("item id too large", qi
.id
);
403 if (!crush
->get_type_name(type
)) {
404 throw BadCrushMap("unknown type name", qi
.id
);
410 bool CrushTester::check_name_maps(unsigned max_id
) const
412 CrushWalker
crush_walker(&crush
, max_id
);
414 // walk through the crush, to see if its self-contained
415 crush_walker
.dump(NULL
);
416 // and see if the maps is also able to handle straying OSDs, whose id >= 0.
417 // "ceph osd tree" will try to print them, even they are not listed in the
419 crush_walker
.dump_item(CrushTreeDumper::Item(0, 0, 0), NULL
);
420 } catch (const BadCrushMap
& e
) {
421 err
<< e
.what() << ": item#" << e
.item
<< std::endl
;
427 static string
get_rule_name(CrushWrapper
& crush
, int rule
)
429 if (crush
.get_rule_name(rule
))
430 return crush
.get_rule_name(rule
);
432 return string("rule") + std::to_string(rule
);
435 void CrushTester::check_overlapped_rules() const
437 namespace icl
= boost::icl
;
438 typedef std::set
<string
> RuleNames
;
439 typedef icl::interval_map
<int, RuleNames
> Rules
;
440 // <ruleset, type> => interval_map<size, {names}>
441 typedef std::map
<std::pair
<int, int>, Rules
> RuleSets
;
442 using interval
= icl::interval
<int>;
444 // mimic the logic of crush_find_rule(), but it only return the first matched
445 // one, but I am collecting all of them by the overlapped sizes.
447 for (int rule
= 0; rule
< crush
.get_max_rules(); rule
++) {
448 if (!crush
.rule_exists(rule
)) {
451 Rules
& rules
= rulesets
[{crush
.get_rule_mask_ruleset(rule
),
452 crush
.get_rule_mask_type(rule
)}];
453 rules
+= make_pair(interval::closed(crush
.get_rule_mask_min_size(rule
),
454 crush
.get_rule_mask_max_size(rule
)),
455 RuleNames
{get_rule_name(crush
, rule
)});
457 for (auto i
: rulesets
) {
458 auto ruleset_type
= i
.first
;
459 const Rules
& rules
= i
.second
;
460 for (auto r
: rules
) {
461 const RuleNames
& names
= r
.second
;
462 // if there are more than one rules covering the same size range,
464 if (names
.size() > 1) {
465 err
<< "overlapped rules in ruleset " << ruleset_type
.first
<< ": "
466 << boost::join(names
, ", ") << "\n";
472 int CrushTester::test()
474 if (min_rule
< 0 || max_rule
< 0) {
476 max_rule
= crush
.get_max_rules() - 1;
478 if (min_x
< 0 || max_x
< 0) {
483 // initial osd weights
484 vector
<__u32
> weight
;
487 * note device weight is set by crushtool
488 * (likely due to a given a command line option)
490 for (int o
= 0; o
< crush
.get_max_devices(); o
++) {
491 if (device_weight
.count(o
)) {
492 weight
.push_back(device_weight
[o
]);
493 } else if (crush
.check_item_present(o
)) {
494 weight
.push_back(0x10000);
500 if (output_utilization_all
)
501 err
<< "devices weights (hex): " << hex
<< weight
<< dec
<< std::endl
;
504 adjust_weights(weight
);
507 int num_devices_active
= 0;
508 for (vector
<__u32
>::iterator p
= weight
.begin(); p
!= weight
.end(); ++p
)
510 num_devices_active
++;
512 if (output_choose_tries
)
513 crush
.start_choose_profile();
515 for (int r
= min_rule
; r
< crush
.get_max_rules() && r
<= max_rule
; r
++) {
516 if (!crush
.rule_exists(r
)) {
517 if (output_statistics
)
518 err
<< "rule " << r
<< " dne" << std::endl
;
522 crush
.get_rule_mask_ruleset(r
) != ruleset
) {
525 int minr
= min_rep
, maxr
= max_rep
;
526 if (min_rep
< 0 || max_rep
< 0) {
527 minr
= crush
.get_rule_mask_min_size(r
);
528 maxr
= crush
.get_rule_mask_max_size(r
);
531 if (output_statistics
)
532 err
<< "rule " << r
<< " (" << crush
.get_rule_name(r
)
533 << "), x = " << min_x
<< ".." << max_x
534 << ", numrep = " << minr
<< ".." << maxr
537 for (int nr
= minr
; nr
<= maxr
; nr
++) {
538 vector
<int> per(crush
.get_max_devices());
541 int num_objects
= ((max_x
- min_x
) + 1);
542 float num_devices
= (float) per
.size(); // get the total number of devices, better to cast as a float here
544 // create a structure to hold data for post-processing
545 tester_data_set tester_data
;
546 vector
<float> vector_data_buffer_f
;
548 // create a map to hold batch-level placement information
549 map
<int, vector
<int> > batch_per
;
550 int objects_per_batch
= num_objects
/ num_batches
;
551 int batch_min
= min_x
;
552 int batch_max
= min_x
+ objects_per_batch
- 1;
554 // get the total weight of the system
555 int total_weight
= 0;
556 for (unsigned i
= 0; i
< per
.size(); i
++)
557 total_weight
+= weight
[i
];
559 if (total_weight
== 0)
562 // compute the expected number of objects stored per device in the absence of weighting
563 float expected_objects
= min(nr
, get_maximum_affected_by_rule(r
)) * num_objects
;
565 // compute each device's proportional weight
566 vector
<float> proportional_weights( per
.size() );
568 for (unsigned i
= 0; i
< per
.size(); i
++)
569 proportional_weights
[i
] = (float) weight
[i
] / (float) total_weight
;
571 if (output_data_file
) {
572 // stage the absolute weight information for post-processing
573 for (unsigned i
= 0; i
< per
.size(); i
++) {
574 tester_data
.absolute_weights
[i
] = (float) weight
[i
] / (float)0x10000;
577 // stage the proportional weight information for post-processing
578 for (unsigned i
= 0; i
< per
.size(); i
++) {
579 if (proportional_weights
[i
] > 0 )
580 tester_data
.proportional_weights
[i
] = proportional_weights
[i
];
582 tester_data
.proportional_weights_all
[i
] = proportional_weights
[i
];
586 // compute the expected number of objects stored per device when a device's weight is considered
587 vector
<float> num_objects_expected(num_devices
);
589 for (unsigned i
= 0; i
< num_devices
; i
++)
590 num_objects_expected
[i
] = (proportional_weights
[i
]*expected_objects
);
592 for (int current_batch
= 0; current_batch
< num_batches
; current_batch
++) {
593 if (current_batch
== (num_batches
- 1)) {
595 objects_per_batch
= (batch_max
- batch_min
+ 1);
598 float batch_expected_objects
= min(nr
, get_maximum_affected_by_rule(r
)) * objects_per_batch
;
599 vector
<float> batch_num_objects_expected( per
.size() );
601 for (unsigned i
= 0; i
< per
.size() ; i
++)
602 batch_num_objects_expected
[i
] = (proportional_weights
[i
]*batch_expected_objects
);
604 // create a vector to hold placement results temporarily
605 vector
<int> temporary_per ( per
.size() );
607 for (int x
= batch_min
; x
<= batch_max
; x
++) {
608 // create a vector to hold the results of a CRUSH placement or RNG simulation
613 err
<< "CRUSH"; // prepend CRUSH to placement output
616 real_x
= crush_hash32_2(CRUSH_HASH_RJENKINS1
, x
, (uint32_t)pool_id
);
618 crush
.do_rule(r
, real_x
, out
, nr
, weight
, 0);
621 err
<< "RNG"; // prepend RNG to placement output to denote simulation
622 // test our new monte carlo placement generator
623 random_placement(r
, out
, nr
, weight
);
627 err
<< " rule " << r
<< " x " << x
<< " " << out
<< std::endl
;
629 if (output_data_file
)
630 write_integer_indexed_vector_data_string(tester_data
.placement_information
, x
, out
);
632 bool has_item_none
= false;
633 for (unsigned i
= 0; i
< out
.size(); i
++) {
634 if (out
[i
] != CRUSH_ITEM_NONE
) {
636 temporary_per
[out
[i
]]++;
638 has_item_none
= true;
642 batch_per
[current_batch
] = temporary_per
;
644 if (output_bad_mappings
&&
645 (out
.size() != (unsigned)nr
||
647 err
<< "bad mapping rule " << r
<< " x " << x
<< " num_rep " << nr
<< " result " << out
<< std::endl
;
651 batch_min
= batch_max
+ 1;
652 batch_max
= batch_min
+ objects_per_batch
- 1;
655 for (unsigned i
= 0; i
< per
.size(); i
++)
656 if (output_utilization
&& !output_statistics
)
657 err
<< " device " << i
658 << ":\t" << per
[i
] << std::endl
;
660 for (map
<int,int>::iterator p
= sizes
.begin(); p
!= sizes
.end(); ++p
)
661 if (output_statistics
)
662 err
<< "rule " << r
<< " (" << crush
.get_rule_name(r
) << ") num_rep " << nr
663 << " result size == " << p
->first
<< ":\t"
664 << p
->second
<< "/" << (max_x
-min_x
+1) << std::endl
;
666 if (output_statistics
)
667 for (unsigned i
= 0; i
< per
.size(); i
++) {
668 if (output_utilization
) {
669 if (num_objects_expected
[i
] > 0 && per
[i
] > 0) {
670 err
<< " device " << i
<< ":\t"
671 << "\t" << " stored " << ": " << per
[i
]
672 << "\t" << " expected " << ": " << num_objects_expected
[i
]
675 } else if (output_utilization_all
) {
676 err
<< " device " << i
<< ":\t"
677 << "\t" << " stored " << ": " << per
[i
]
678 << "\t" << " expected " << ": " << num_objects_expected
[i
]
683 if (output_data_file
)
684 for (unsigned i
= 0; i
< per
.size(); i
++) {
685 vector_data_buffer_f
.clear();
686 vector_data_buffer_f
.push_back( (float) per
[i
]);
687 vector_data_buffer_f
.push_back( (float) num_objects_expected
[i
]);
689 write_integer_indexed_vector_data_string(tester_data
.device_utilization_all
, i
, vector_data_buffer_f
);
691 if (num_objects_expected
[i
] > 0 && per
[i
] > 0)
692 write_integer_indexed_vector_data_string(tester_data
.device_utilization
, i
, vector_data_buffer_f
);
695 if (output_data_file
&& num_batches
> 1) {
696 // stage batch utilization information for post-processing
697 for (int i
= 0; i
< num_batches
; i
++) {
698 write_integer_indexed_vector_data_string(tester_data
.batch_device_utilization_all
, i
, batch_per
[i
]);
699 write_integer_indexed_vector_data_string(tester_data
.batch_device_expected_utilization_all
, i
, batch_per
[i
]);
703 string rule_tag
= crush
.get_rule_name(r
);
706 write_data_set_to_csv(output_data_file_name
+rule_tag
,tester_data
);
710 if (output_choose_tries
) {
712 int n
= crush
.get_choose_profile(&v
);
713 for (int i
=0; i
<n
; i
++) {
714 cout
.setf(std::ios::right
);
716 << i
<< ": " << std::setw(9) << v
[i
];
717 cout
.unsetf(std::ios::right
);
721 crush
.stop_choose_profile();