1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
8 #include <boost/lexical_cast.hpp>
9 #include <boost/icl/interval_map.hpp>
10 #include <boost/algorithm/string/join.hpp>
12 #include "common/SubProcess.h"
13 #include "common/fork_function.h"
15 #include "include/stringify.h"
16 #include "CrushTester.h"
17 #include "CrushTreeDumper.h"
18 #include "include/ceph_features.h"
24 using std::ostringstream
;
26 using std::stringstream
;
29 void CrushTester::set_device_weight(int dev
, float f
)
31 int w
= (int)(f
* 0x10000);
36 device_weight
[dev
] = w
;
39 int CrushTester::get_maximum_affected_by_rule(int ruleno
)
41 // get the number of steps in RULENO
42 int rule_size
= crush
.get_rule_len(ruleno
);
43 vector
<int> affected_types
;
44 map
<int,int> replications_by_type
;
46 for (int i
= 0; i
< rule_size
; i
++){
47 // get what operation is done by the current step
48 int rule_operation
= crush
.get_rule_op(ruleno
, i
);
50 // if the operation specifies choosing a device type, store it
51 if (rule_operation
>= 2 && rule_operation
!= 4){
52 int desired_replication
= crush
.get_rule_arg1(ruleno
,i
);
53 int affected_type
= crush
.get_rule_arg2(ruleno
,i
);
54 affected_types
.push_back(affected_type
);
55 replications_by_type
[affected_type
] = desired_replication
;
60 * now for each of the affected bucket types, see what is the
61 * maximum we are (a) requesting or (b) have
64 map
<int,int> max_devices_of_type
;
66 // loop through the vector of affected types
67 for (vector
<int>::iterator it
= affected_types
.begin(); it
!= affected_types
.end(); ++it
){
68 // loop through the number of buckets looking for affected types
69 for (map
<int,string
>::iterator p
= crush
.name_map
.begin(); p
!= crush
.name_map
.end(); ++p
){
70 int bucket_type
= crush
.get_bucket_type(p
->first
);
71 if ( bucket_type
== *it
)
72 max_devices_of_type
[*it
]++;
76 for(std::vector
<int>::iterator it
= affected_types
.begin(); it
!= affected_types
.end(); ++it
){
77 if ( replications_by_type
[*it
] > 0 && replications_by_type
[*it
] < max_devices_of_type
[*it
] )
78 max_devices_of_type
[*it
] = replications_by_type
[*it
];
82 * get the smallest number of buckets available of any type as this is our upper bound on
83 * the number of replicas we can place
85 int max_affected
= std::max( crush
.get_max_buckets(), crush
.get_max_devices() );
87 for(std::vector
<int>::iterator it
= affected_types
.begin(); it
!= affected_types
.end(); ++it
){
88 if (max_devices_of_type
[*it
] > 0 && max_devices_of_type
[*it
] < max_affected
)
89 max_affected
= max_devices_of_type
[*it
];
96 map
<int,int> CrushTester::get_collapsed_mapping()
98 int num_to_check
= crush
.get_max_devices();
100 map
<int, int> collapse_mask
;
102 for (int i
= 0; i
< num_to_check
; i
++){
103 if (crush
.check_item_present(i
)){
104 collapse_mask
[i
] = next_id
;
109 return collapse_mask
;
112 void CrushTester::adjust_weights(vector
<__u32
>& weight
)
115 if (mark_down_device_ratio
> 0) {
117 vector
<int> bucket_ids
;
118 for (int i
= 0; i
< crush
.get_max_buckets(); i
++) {
120 if (crush
.get_bucket_weight(id
) > 0) {
121 bucket_ids
.push_back(id
);
125 // get buckets that are one level above a device
126 vector
<int> buckets_above_devices
;
127 for (unsigned i
= 0; i
< bucket_ids
.size(); i
++) {
128 // grab the first child object of a bucket and check if it's ID is less than 0
129 int id
= bucket_ids
[i
];
130 if (crush
.get_bucket_size(id
) == 0)
132 int first_child
= crush
.get_bucket_item(id
, 0); // returns the ID of the bucket or device
133 if (first_child
>= 0) {
134 buckets_above_devices
.push_back(id
);
138 // permute bucket list
139 for (unsigned i
= 0; i
< buckets_above_devices
.size(); i
++) {
140 unsigned j
= lrand48() % (buckets_above_devices
.size() - 1);
141 std::swap(buckets_above_devices
[i
], buckets_above_devices
[j
]);
144 // calculate how many buckets and devices we need to reap...
145 int num_buckets_to_visit
= (int) (mark_down_bucket_ratio
* buckets_above_devices
.size());
147 for (int i
= 0; i
< num_buckets_to_visit
; i
++) {
148 int id
= buckets_above_devices
[i
];
149 int size
= crush
.get_bucket_size(id
);
151 for (int o
= 0; o
< size
; o
++)
152 items
.push_back(crush
.get_bucket_item(id
, o
));
155 for (int o
= 0; o
< size
; o
++) {
156 int j
= lrand48() % (crush
.get_bucket_size(id
) - 1);
157 std::swap(items
[o
], items
[j
]);
160 int local_devices_to_visit
= (int) (mark_down_device_ratio
*size
);
161 for (int o
= 0; o
< local_devices_to_visit
; o
++){
162 int item
= crush
.get_bucket_item(id
, o
);
169 bool CrushTester::check_valid_placement(int ruleno
, vector
<int> in
, const vector
<__u32
>& weight
)
172 bool valid_placement
= true;
173 vector
<int> included_devices
;
174 map
<string
,string
> seen_devices
;
176 // first do the easy check that all devices are "up"
177 for (vector
<int>::iterator it
= in
.begin(); it
!= in
.end(); ++it
) {
178 if (weight
[(*it
)] == 0) {
179 valid_placement
= false;
181 } else if (weight
[(*it
)] > 0) {
182 included_devices
.push_back( (*it
) );
187 * now do the harder test of checking that the CRUSH rule r is not violated
188 * we could test that none of the devices mentioned in out are unique,
189 * but this is a special case of this test
192 // get the number of steps in RULENO
193 int rule_size
= crush
.get_rule_len(ruleno
);
194 vector
<string
> affected_types
;
196 // get the smallest type id, and name
197 int min_map_type
= crush
.get_num_type_names();
198 for (map
<int,string
>::iterator it
= crush
.type_map
.begin(); it
!= crush
.type_map
.end(); ++it
) {
199 if ( (*it
).first
< min_map_type
) {
200 min_map_type
= (*it
).first
;
204 string min_map_type_name
= crush
.type_map
[min_map_type
];
206 // get the types of devices affected by RULENO
207 for (int i
= 0; i
< rule_size
; i
++) {
208 // get what operation is done by the current step
209 int rule_operation
= crush
.get_rule_op(ruleno
, i
);
211 // if the operation specifies choosing a device type, store it
212 if (rule_operation
>= 2 && rule_operation
!= 4) {
213 int affected_type
= crush
.get_rule_arg2(ruleno
,i
);
214 affected_types
.push_back( crush
.get_type_name(affected_type
));
218 // find in if we are only dealing with osd's
219 bool only_osd_affected
= false;
220 if (affected_types
.size() == 1) {
221 if ((affected_types
.back() == min_map_type_name
) && (min_map_type_name
== "osd")) {
222 only_osd_affected
= true;
226 // check that we don't have any duplicate id's
227 for (vector
<int>::iterator it
= included_devices
.begin(); it
!= included_devices
.end(); ++it
) {
228 int num_copies
= std::count(included_devices
.begin(), included_devices
.end(), (*it
) );
229 if (num_copies
> 1) {
230 valid_placement
= false;
234 // if we have more than just osd's affected we need to do a lot more work
235 if (!only_osd_affected
) {
236 // loop through the devices that are "in/up"
237 for (vector
<int>::iterator it
= included_devices
.begin(); it
!= included_devices
.end(); ++it
) {
238 if (valid_placement
== false)
241 // create a temporary map of the form (device type, device name in map)
242 map
<string
,string
> device_location_hierarchy
= crush
.get_full_location(*it
);
244 // loop over the types affected by RULENO looking for duplicate bucket assignments
245 for (vector
<string
>::iterator t
= affected_types
.begin(); t
!= affected_types
.end(); ++t
) {
246 if (seen_devices
.count( device_location_hierarchy
[*t
])) {
247 valid_placement
= false;
250 // store the devices we have seen in the form of (device name, device type)
251 seen_devices
[ device_location_hierarchy
[*t
] ] = *t
;
257 return valid_placement
;
260 int CrushTester::random_placement(int ruleno
, vector
<int>& out
, int maxout
, vector
<__u32
>& weight
)
262 // get the total weight of the system
263 int total_weight
= 0;
264 for (unsigned i
= 0; i
< weight
.size(); i
++)
265 total_weight
+= weight
[i
];
267 if (total_weight
== 0 ||
268 crush
.get_max_devices() == 0)
271 // determine the real maximum number of devices to return
272 int devices_requested
= std::min(maxout
, get_maximum_affected_by_rule(ruleno
));
273 bool accept_placement
= false;
275 vector
<int> trial_placement(devices_requested
);
276 int attempted_tries
= 0;
279 // create a vector to hold our trial mappings
280 int temp_array
[devices_requested
];
281 for (int i
= 0; i
< devices_requested
; i
++){
282 temp_array
[i
] = lrand48() % (crush
.get_max_devices());
285 trial_placement
.assign(temp_array
, temp_array
+ devices_requested
);
286 accept_placement
= check_valid_placement(ruleno
, trial_placement
, weight
);
288 } while (accept_placement
== false && attempted_tries
< max_tries
);
290 // save our random placement to the out vector
291 if (accept_placement
)
292 out
.assign(trial_placement
.begin(), trial_placement
.end());
295 else if (attempted_tries
== max_tries
)
301 void CrushTester::write_integer_indexed_vector_data_string(vector
<string
> &dst
, int index
, vector
<int> vector_data
)
303 stringstream
data_buffer (stringstream::in
| stringstream::out
);
304 unsigned input_size
= vector_data
.size();
306 // pass the indexing variable to the data buffer
307 data_buffer
<< index
;
309 // pass the rest of the input data to the buffer
310 for (unsigned i
= 0; i
< input_size
; i
++) {
311 data_buffer
<< ',' << vector_data
[i
];
314 data_buffer
<< std::endl
;
316 // write the data buffer to the destination
317 dst
.push_back( data_buffer
.str() );
320 void CrushTester::write_integer_indexed_vector_data_string(vector
<string
> &dst
, int index
, vector
<float> vector_data
)
322 stringstream
data_buffer (stringstream::in
| stringstream::out
);
323 unsigned input_size
= vector_data
.size();
325 // pass the indexing variable to the data buffer
326 data_buffer
<< index
;
328 // pass the rest of the input data to the buffer
329 for (unsigned i
= 0; i
< input_size
; i
++) {
330 data_buffer
<< ',' << vector_data
[i
];
333 data_buffer
<< std::endl
;
335 // write the data buffer to the destination
336 dst
.push_back( data_buffer
.str() );
339 void CrushTester::write_integer_indexed_scalar_data_string(vector
<string
> &dst
, int index
, int scalar_data
)
341 stringstream
data_buffer (stringstream::in
| stringstream::out
);
343 // pass the indexing variable to the data buffer
344 data_buffer
<< index
;
346 // pass the input data to the buffer
347 data_buffer
<< ',' << scalar_data
;
348 data_buffer
<< std::endl
;
350 // write the data buffer to the destination
351 dst
.push_back( data_buffer
.str() );
353 void CrushTester::write_integer_indexed_scalar_data_string(vector
<string
> &dst
, int index
, float scalar_data
)
355 stringstream
data_buffer (stringstream::in
| stringstream::out
);
357 // pass the indexing variable to the data buffer
358 data_buffer
<< index
;
360 // pass the input data to the buffer
361 data_buffer
<< ',' << scalar_data
;
362 data_buffer
<< std::endl
;
364 // write the data buffer to the destination
365 dst
.push_back( data_buffer
.str() );
368 int CrushTester::test_with_fork(int timeout
)
371 int r
= fork_function(timeout
, sink
, [&]() {
374 if (r
== -ETIMEDOUT
) {
375 err
<< "timed out during smoke test (" << timeout
<< " seconds)";
381 class BadCrushMap
: public std::runtime_error
{
384 BadCrushMap(const char* msg
, int id
)
385 : std::runtime_error(msg
), item(id
) {}
387 // throws if any node in the crush fail to print
388 class CrushWalker
: public CrushTreeDumper::Dumper
<void> {
389 typedef void DumbFormatter
;
390 typedef CrushTreeDumper::Dumper
<DumbFormatter
> Parent
;
393 CrushWalker(const CrushWrapper
*crush
, unsigned max_id
)
394 : Parent(crush
, CrushTreeDumper::name_map_t()), max_id(max_id
) {}
395 void dump_item(const CrushTreeDumper::Item
&qi
, DumbFormatter
*) override
{
397 if (qi
.is_bucket()) {
398 if (!crush
->get_item_name(qi
.id
)) {
399 throw BadCrushMap("unknown item name", qi
.id
);
401 type
= crush
->get_bucket_type(qi
.id
);
403 if (max_id
> 0 && qi
.id
>= max_id
) {
404 throw BadCrushMap("item id too large", qi
.id
);
408 if (!crush
->get_type_name(type
)) {
409 throw BadCrushMap("unknown type name", qi
.id
);
415 bool CrushTester::check_name_maps(unsigned max_id
) const
417 CrushWalker
crush_walker(&crush
, max_id
);
419 // walk through the crush, to see if its self-contained
420 crush_walker
.dump(NULL
);
421 // and see if the maps is also able to handle straying OSDs, whose id >= 0.
422 // "ceph osd tree" will try to print them, even they are not listed in the
424 crush_walker
.dump_item(CrushTreeDumper::Item(0, 0, 0, 0), NULL
);
425 } catch (const BadCrushMap
& e
) {
426 err
<< e
.what() << ": item#" << e
.item
<< std::endl
;
432 int CrushTester::test()
434 if (min_rule
< 0 || max_rule
< 0) {
436 max_rule
= crush
.get_max_rules() - 1;
438 if (min_x
< 0 || max_x
< 0) {
442 if (min_rep
< 0 && max_rep
< 0) {
443 cerr
<< "must specify --num-rep or both --min-rep and --max-rep" << std::endl
;
447 // initial osd weights
448 vector
<__u32
> weight
;
451 * note device weight is set by crushtool
452 * (likely due to a given a command line option)
454 for (int o
= 0; o
< crush
.get_max_devices(); o
++) {
455 if (device_weight
.count(o
)) {
456 weight
.push_back(device_weight
[o
]);
457 } else if (crush
.check_item_present(o
)) {
458 weight
.push_back(0x10000);
464 if (output_utilization_all
)
465 cerr
<< "devices weights (hex): " << std::hex
<< weight
<< std::dec
<< std::endl
;
468 adjust_weights(weight
);
471 int num_devices_active
= 0;
472 for (vector
<__u32
>::iterator p
= weight
.begin(); p
!= weight
.end(); ++p
)
474 num_devices_active
++;
476 if (output_choose_tries
)
477 crush
.start_choose_profile();
479 for (int r
= min_rule
; r
< crush
.get_max_rules() && r
<= max_rule
; r
++) {
480 if (!crush
.rule_exists(r
)) {
481 if (output_statistics
)
482 err
<< "rule " << r
<< " dne" << std::endl
;
486 if (output_statistics
)
487 err
<< "rule " << r
<< " (" << crush
.get_rule_name(r
)
488 << "), x = " << min_x
<< ".." << max_x
489 << ", numrep = " << min_rep
<< ".." << max_rep
492 for (int nr
= min_rep
; nr
<= max_rep
; nr
++) {
493 vector
<int> per(crush
.get_max_devices());
496 int num_objects
= ((max_x
- min_x
) + 1);
497 float num_devices
= (float) per
.size(); // get the total number of devices, better to cast as a float here
499 // create a structure to hold data for post-processing
500 tester_data_set tester_data
;
501 vector
<float> vector_data_buffer_f
;
503 // create a map to hold batch-level placement information
504 map
<int, vector
<int> > batch_per
;
505 int objects_per_batch
= num_objects
/ num_batches
;
506 int batch_min
= min_x
;
507 int batch_max
= min_x
+ objects_per_batch
- 1;
509 // get the total weight of the system
510 int total_weight
= 0;
511 for (unsigned i
= 0; i
< per
.size(); i
++)
512 total_weight
+= weight
[i
];
514 if (total_weight
== 0)
517 // compute the expected number of objects stored per device in the absence of weighting
518 float expected_objects
= std::min(nr
, get_maximum_affected_by_rule(r
)) * num_objects
;
520 // compute each device's proportional weight
521 vector
<float> proportional_weights( per
.size() );
523 for (unsigned i
= 0; i
< per
.size(); i
++)
524 proportional_weights
[i
] = (float) weight
[i
] / (float) total_weight
;
526 if (output_data_file
) {
527 // stage the absolute weight information for post-processing
528 for (unsigned i
= 0; i
< per
.size(); i
++) {
529 tester_data
.absolute_weights
[i
] = (float) weight
[i
] / (float)0x10000;
532 // stage the proportional weight information for post-processing
533 for (unsigned i
= 0; i
< per
.size(); i
++) {
534 if (proportional_weights
[i
] > 0 )
535 tester_data
.proportional_weights
[i
] = proportional_weights
[i
];
537 tester_data
.proportional_weights_all
[i
] = proportional_weights
[i
];
541 // compute the expected number of objects stored per device when a device's weight is considered
542 vector
<float> num_objects_expected(num_devices
);
544 for (unsigned i
= 0; i
< num_devices
; i
++)
545 num_objects_expected
[i
] = (proportional_weights
[i
]*expected_objects
);
547 for (int current_batch
= 0; current_batch
< num_batches
; current_batch
++) {
548 if (current_batch
== (num_batches
- 1)) {
550 objects_per_batch
= (batch_max
- batch_min
+ 1);
553 float batch_expected_objects
= std::min(nr
, get_maximum_affected_by_rule(r
)) * objects_per_batch
;
554 vector
<float> batch_num_objects_expected( per
.size() );
556 for (unsigned i
= 0; i
< per
.size() ; i
++)
557 batch_num_objects_expected
[i
] = (proportional_weights
[i
]*batch_expected_objects
);
559 // create a vector to hold placement results temporarily
560 vector
<int> temporary_per ( per
.size() );
562 for (int x
= batch_min
; x
<= batch_max
; x
++) {
563 // create a vector to hold the results of a CRUSH placement or RNG simulation
568 err
<< "CRUSH"; // prepend CRUSH to placement output
571 real_x
= crush_hash32_2(CRUSH_HASH_RJENKINS1
, x
, (uint32_t)pool_id
);
573 crush
.do_rule(r
, real_x
, out
, nr
, weight
, 0);
576 err
<< "RNG"; // prepend RNG to placement output to denote simulation
577 // test our new monte carlo placement generator
578 random_placement(r
, out
, nr
, weight
);
582 err
<< " rule " << r
<< " x " << x
<< " " << out
<< std::endl
;
584 if (output_data_file
)
585 write_integer_indexed_vector_data_string(tester_data
.placement_information
, x
, out
);
587 bool has_item_none
= false;
588 for (unsigned i
= 0; i
< out
.size(); i
++) {
589 if (out
[i
] != CRUSH_ITEM_NONE
) {
591 temporary_per
[out
[i
]]++;
593 has_item_none
= true;
597 batch_per
[current_batch
] = temporary_per
;
599 if (output_bad_mappings
&&
600 (out
.size() != (unsigned)nr
||
602 err
<< "bad mapping rule " << r
<< " x " << x
<< " num_rep " << nr
<< " result " << out
<< std::endl
;
606 batch_min
= batch_max
+ 1;
607 batch_max
= batch_min
+ objects_per_batch
- 1;
610 for (unsigned i
= 0; i
< per
.size(); i
++)
611 if (output_utilization
&& !output_statistics
)
612 err
<< " device " << i
613 << ":\t" << per
[i
] << std::endl
;
615 for (map
<int,int>::iterator p
= sizes
.begin(); p
!= sizes
.end(); ++p
)
616 if (output_statistics
)
617 err
<< "rule " << r
<< " (" << crush
.get_rule_name(r
) << ") num_rep " << nr
618 << " result size == " << p
->first
<< ":\t"
619 << p
->second
<< "/" << (max_x
-min_x
+1) << std::endl
;
621 if (output_statistics
)
622 for (unsigned i
= 0; i
< per
.size(); i
++) {
623 if (output_utilization
) {
624 if (num_objects_expected
[i
] > 0 && per
[i
] > 0) {
625 err
<< " device " << i
<< ":\t"
626 << "\t" << " stored " << ": " << per
[i
]
627 << "\t" << " expected " << ": " << num_objects_expected
[i
]
630 } else if (output_utilization_all
) {
631 err
<< " device " << i
<< ":\t"
632 << "\t" << " stored " << ": " << per
[i
]
633 << "\t" << " expected " << ": " << num_objects_expected
[i
]
638 if (output_data_file
)
639 for (unsigned i
= 0; i
< per
.size(); i
++) {
640 vector_data_buffer_f
.clear();
641 vector_data_buffer_f
.push_back( (float) per
[i
]);
642 vector_data_buffer_f
.push_back( (float) num_objects_expected
[i
]);
644 write_integer_indexed_vector_data_string(tester_data
.device_utilization_all
, i
, vector_data_buffer_f
);
646 if (num_objects_expected
[i
] > 0 && per
[i
] > 0)
647 write_integer_indexed_vector_data_string(tester_data
.device_utilization
, i
, vector_data_buffer_f
);
650 if (output_data_file
&& num_batches
> 1) {
651 // stage batch utilization information for post-processing
652 for (int i
= 0; i
< num_batches
; i
++) {
653 write_integer_indexed_vector_data_string(tester_data
.batch_device_utilization_all
, i
, batch_per
[i
]);
654 write_integer_indexed_vector_data_string(tester_data
.batch_device_expected_utilization_all
, i
, batch_per
[i
]);
658 string rule_tag
= crush
.get_rule_name(r
);
661 write_data_set_to_csv(output_data_file_name
+rule_tag
,tester_data
);
665 if (output_choose_tries
) {
667 int n
= crush
.get_choose_profile(&v
);
668 for (int i
=0; i
<n
; i
++) {
669 cout
.setf(std::ios::right
);
671 << i
<< ": " << std::setw(9) << v
[i
];
672 cout
.unsetf(std::ios::right
);
676 crush
.stop_choose_profile();
682 int CrushTester::compare(CrushWrapper
& crush2
)
684 if (min_rule
< 0 || max_rule
< 0) {
686 max_rule
= crush
.get_max_rules() - 1;
688 if (min_x
< 0 || max_x
< 0) {
693 // initial osd weights
694 vector
<__u32
> weight
;
697 * note device weight is set by crushtool
698 * (likely due to a given a command line option)
700 for (int o
= 0; o
< crush
.get_max_devices(); o
++) {
701 if (device_weight
.count(o
)) {
702 weight
.push_back(device_weight
[o
]);
703 } else if (crush
.check_item_present(o
)) {
704 weight
.push_back(0x10000);
711 adjust_weights(weight
);
713 map
<int,int> bad_by_rule
;
716 for (int r
= min_rule
; r
< crush
.get_max_rules() && r
<= max_rule
; r
++) {
717 if (!crush
.rule_exists(r
)) {
718 if (output_statistics
)
719 err
<< "rule " << r
<< " dne" << std::endl
;
723 for (int nr
= min_rep
; nr
<= max_rep
; nr
++) {
724 for (int x
= min_x
; x
<= max_x
; ++x
) {
726 crush
.do_rule(r
, x
, out
, nr
, weight
, 0);
728 crush2
.do_rule(r
, x
, out2
, nr
, weight
, 0);
737 int max
= (max_rep
- min_rep
+ 1) * (max_x
- min_x
+ 1);
738 double ratio
= (double)bad
/ (double)max
;
739 cout
<< "rule " << r
<< " had " << bad
<< "/" << max
740 << " mismatched mappings (" << ratio
<< ")" << std::endl
;
743 cerr
<< "warning: maps are NOT equivalent" << std::endl
;
745 cout
<< "maps appear equivalent" << std::endl
;