1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 #include "gtest/gtest.h"
3 #include "osd/OSDMap.h"
4 #include "osd/OSDMapMapping.h"
5 #include "mon/OSDMonitor.h"
8 #include "global/global_context.h"
9 #include "global/global_init.h"
10 #include "common/common_init.h"
11 #include "common/ceph_argparse.h"
12 #include "common/ceph_json.h"
18 int main(int argc
, char **argv
) {
19 map
<string
,string
> defaults
= {
20 // make sure we have 3 copies, or some tests won't work
21 { "osd_pool_default_size", "3" },
22 // our map is flat, so just try and split across OSDs, not hosts or whatever
23 { "osd_crush_chooseleaf_type", "0" },
25 std::vector
<const char*> args(argv
, argv
+argc
);
26 auto cct
= global_init(&defaults
, args
, CEPH_ENTITY_TYPE_CLIENT
,
27 CODE_ENVIRONMENT_UTILITY
,
28 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE
);
29 common_init_finish(g_ceph_context
);
30 ::testing::InitGoogleTest(&argc
, argv
);
31 return RUN_ALL_TESTS();
// Fixture shared by every OSDMap test below: owns the osdmap under test,
// a precalculated PG->OSD mapping, and the ids of the two pools that
// set_up_map() creates.
// NOTE(review): extraction dropped interior lines here (access
// specifiers and several member declarations such as osdmap/num_osds
// are missing); the fragment is preserved verbatim.
34 class OSDMapTest
: public testing::Test
{
// Precalculated PG->OSD mapping; refreshed from the live osdmap before use.
38 OSDMapMapping mapping
;
// Pool ids asserted in set_up_map(): the EC pool is created first,
// the replicated pool second.
39 const uint64_t my_ec_pool
= 1;
40 const uint64_t my_rep_pool
= 2;
// Build a fresh test osdmap with new_num_osds OSDs, mark them all as
// existing/new/up/in via one incremental, and (unless no_default_pools)
// create an erasure rule plus an "ec" pool and a "reppool" replicated
// pool whose ids must match my_ec_pool / my_rep_pool.
// NOTE(review): extraction dropped interior lines (loop/scope closing
// braces, the early-return for no_default_pools, the tail of the
// add_simple_rule call, and pool-parameter setup such as size/pg_num);
// the fragment is preserved verbatim.
45 void set_up_map(int new_num_osds
= 6, bool no_default_pools
= false) {
46 num_osds
= new_num_osds
;
// Start from a flat "simple" map containing num_osds devices.
48 osdmap
.build_simple(g_ceph_context
, 0, fsid
, num_osds
);
49 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
50 pending_inc
.fsid
= osdmap
.get_fsid();
51 entity_addrvec_t sample_addrs
;
52 sample_addrs
.v
.push_back(entity_addr_t());
// Bring every OSD up/in with a unique nonce and uuid.
54 for (int i
= 0; i
< num_osds
; ++i
) {
55 sample_uuid
.generate_random();
// Distinct nonce per OSD so the (otherwise identical) addrs differ.
56 sample_addrs
.v
[0].nonce
= i
;
57 pending_inc
.new_state
[i
] = CEPH_OSD_EXISTS
| CEPH_OSD_NEW
;
58 pending_inc
.new_up_client
[i
] = sample_addrs
;
59 pending_inc
.new_up_cluster
[i
] = sample_addrs
;
60 pending_inc
.new_hb_back_up
[i
] = sample_addrs
;
61 pending_inc
.new_hb_front_up
[i
] = sample_addrs
;
// CEPH_OSD_IN as a weight means fully "in".
62 pending_inc
.new_weight
[i
] = CEPH_OSD_IN
;
63 pending_inc
.new_uuid
[i
] = sample_uuid
;
65 osdmap
.apply_incremental(pending_inc
);
66 if (no_default_pools
) // do not create any default pool(s)
69 // Create an EC ruleset and a pool using it
70 int r
= osdmap
.crush
->add_simple_rule(
71 "erasure", "default", "osd", "",
72 "indep", pg_pool_t::TYPE_ERASURE
,
75 OSDMap::Incremental
new_pool_inc(osdmap
.get_epoch() + 1);
76 new_pool_inc
.new_pool_max
= osdmap
.get_pool_max();
77 new_pool_inc
.fsid
= osdmap
.get_fsid();
// First pool created must land on id my_ec_pool (== 1).
80 uint64_t pool_id
= ++new_pool_inc
.new_pool_max
;
81 ceph_assert(pool_id
== my_ec_pool
);
82 pg_pool_t
*p
= new_pool_inc
.get_new_pool(pool_id
, &empty
);
86 p
->type
= pg_pool_t::TYPE_ERASURE
;
88 new_pool_inc
.new_pool_names
[pool_id
] = "ec";
89 // and a replicated pool
// Second pool must land on id my_rep_pool (== 2).
90 pool_id
= ++new_pool_inc
.new_pool_max
;
91 ceph_assert(pool_id
== my_rep_pool
);
92 p
= new_pool_inc
.get_new_pool(pool_id
, &empty
);
96 p
->type
= pg_pool_t::TYPE_REPLICATED
;
98 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
99 new_pool_inc
.new_pool_names
[pool_id
] = "reppool";
100 osdmap
.apply_incremental(new_pool_inc
);
// Accessor for the OSD count configured by set_up_map().
102 unsigned int get_num_osds() { return num_osds
; }
// Produce a modifiable deep copy of tmap's crush map in newcrush by
// round-tripping it through an encode/decode.
// NOTE(review): extraction dropped interior lines (the bufferlist
// declaration and the decode into newcrush, plus the closing brace);
// the fragment is preserved verbatim.
103 void get_crush(const OSDMap
& tmap
, CrushWrapper
& newcrush
) {
105 tmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
106 auto p
= bl
.cbegin();
// Move an existing crush item (leaf osd or bucket) called *name* to the
// location described by argvec ("key=value" strings, e.g. "root=default",
// "host=host-0"), then commit the edited crush map back into tmap via an
// incremental.
// NOTE(review): extraction dropped interior lines (the error return for
// an unknown name, the bucket-vs-leaf branch structure, and the final
// return); the fragment is preserved verbatim.
109 int crush_move(OSDMap
& tmap
, const string
&name
, const vector
<string
> &argvec
) {
// Parse "key=value" tokens into a crush location map.
110 map
<string
,string
> loc
;
111 CrushWrapper::parse_loc_map(argvec
, &loc
);
// Edit a private copy of the crush map, not tmap's live one.
112 CrushWrapper newcrush
;
113 get_crush(tmap
, newcrush
);
114 if (!newcrush
.name_exists(name
)) {
117 int id
= newcrush
.get_item_id(name
);
// Only move if the item is not already at the requested location.
119 if (!newcrush
.check_item_loc(g_ceph_context
, id
, loc
, (int *)NULL
)) {
// Leaf device: create-or-move with weight 0 keeps its current weight.
121 err
= newcrush
.create_or_move_item(g_ceph_context
, id
, 0, name
, loc
);
// Bucket: relocate the whole subtree instead.
123 err
= newcrush
.move_bucket(g_ceph_context
, id
, loc
);
// Commit the modified crush map into tmap.
126 OSDMap::Incremental
pending_inc(tmap
.get_epoch() + 1);
127 pending_inc
.crush
.clear();
128 newcrush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
129 tmap
.apply_incremental(pending_inc
);
// Return the id of the replicated crush rule *name*, creating it first
// (failure domain *type*, "firstn" mode) if it does not exist on osdmap.
// NOTE(review): extraction dropped interior lines (the declarations of
// root/device_class/ss presumably on original lines 146-147, and the
// final return of ruleno); the fragment is preserved verbatim.
138 int crush_rule_create_replicated(const string
&name
,
140 const string
&type
) {
// Fast path: reuse an existing rule of the same name.
141 if (osdmap
.crush
->rule_exists(name
)) {
142 return osdmap
.crush
->get_rule_id(name
);
// Otherwise build the rule on a private copy of the crush map.
144 CrushWrapper newcrush
;
145 get_crush(osdmap
, newcrush
);
148 int ruleno
= newcrush
.add_simple_rule(
149 name
, root
, type
, device_class
,
150 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
// Commit the new rule back into osdmap.
152 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
153 pending_inc
.crush
.clear();
154 newcrush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
155 osdmap
.apply_incremental(pending_inc
);
159 void test_mappings(int pool
,
163 vector
<int> *primary
) {
164 mapping
.update(osdmap
);
165 for (int i
=0; i
<num
; ++i
) {
166 vector
<int> up
, acting
;
167 int up_primary
, acting_primary
;
169 osdmap
.pg_to_up_acting_osds(pgid
,
170 &up
, &up_primary
, &acting
, &acting_primary
);
171 for (unsigned j
=0; j
<acting
.size(); ++j
)
174 (*first
)[acting
[0]]++;
175 if (acting_primary
>= 0)
176 (*primary
)[acting_primary
]++;
178 // compare to precalc mapping
179 vector
<int> up2
, acting2
;
180 int up_primary2
, acting_primary2
;
181 pgid
= osdmap
.raw_pg_to_pg(pgid
);
182 mapping
.get(pgid
, &up2
, &up_primary2
, &acting2
, &acting_primary2
);
184 ASSERT_EQ(up_primary
, up_primary2
);
185 ASSERT_EQ(acting
, acting2
);
186 ASSERT_EQ(acting_primary
, acting_primary2
);
188 cout
<< "any: " << *any
<< std::endl
;;
189 cout
<< "first: " << *first
<< std::endl
;;
190 cout
<< "primary: " << *primary
<< std::endl
;;
// Run OSDMonitor's parallel upmap sanitization over every pg that has
// upmap entries, accumulating cancellations into pending_inc.
// NOTE(review): extraction dropped interior lines (the OSDMap parameter
// `om` presumably declared on original line 193, the cpu_num definition,
// and the thread-pool start/drain/stop calls); fragment preserved
// verbatim.
192 void clean_pg_upmaps(CephContext
*cct
,
194 OSDMap::Incremental
& pending_inc
) {
// Each worker item checks mappings in chunks of 256 pgs.
196 int pgs_per_chunk
= 256;
197 ThreadPool
tp(cct
, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num
);
199 ParallelPGMapper
mapper(cct
, &tp
);
200 vector
<pg_t
> pgs_to_check
;
// Collect every pg that currently carries pg_upmap/pg_upmap_items.
201 om
.get_upmap_pgs(&pgs_to_check
);
202 OSDMonitor::CleanUpmapJob
job(cct
, om
, pending_inc
);
// Fan the check out across the thread pool.
203 mapper
.queue(&job
, pgs_per_chunk
, pgs_to_check
);
// After fixture setup (the set_up_map() call on a dropped line,
// presumably original line 210), every configured OSD should exist and
// be "in".
209 TEST_F(OSDMapTest
, Create
) {
211 ASSERT_EQ(get_num_osds(), (unsigned)osdmap
.get_max_osd());
212 ASSERT_EQ(get_num_osds(), osdmap
.get_num_in_osds());
// Verify the feature bits osdmap advertises to OSDs, clients, and mons,
// and that removing the EC pool drops CRUSH_V2 from the mon view while
// primary affinity keeps the TUNABLES3 bit set.
// NOTE(review): extraction dropped interior lines (the set_up_map()
// call and scope braces); the fragment is preserved verbatim.
215 TEST_F(OSDMapTest
, Features
) {
// OSD-facing view.
218 uint64_t features
= osdmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
219 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES
);
220 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES2
);
221 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES3
);
222 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_V2
);
223 ASSERT_TRUE(features
& CEPH_FEATURE_OSDHASHPSPOOL
);
224 ASSERT_TRUE(features
& CEPH_FEATURE_OSD_PRIMARY_AFFINITY
);
226 // clients have a slightly different view
227 features
= osdmap
.get_features(CEPH_ENTITY_TYPE_CLIENT
, NULL
);
228 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES
);
229 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES2
);
230 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES3
);
231 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_V2
);
232 ASSERT_TRUE(features
& CEPH_FEATURE_OSDHASHPSPOOL
);
233 ASSERT_TRUE(features
& CEPH_FEATURE_OSD_PRIMARY_AFFINITY
);
235 // remove the EC pool, but leave the rule. add primary affinity.
237 OSDMap::Incremental
new_pool_inc(osdmap
.get_epoch() + 1);
238 new_pool_inc
.old_pools
.insert(osdmap
.lookup_pg_pool_name("ec"));
// 0x8000 is half of the full 0x10000 primary-affinity weight.
239 new_pool_inc
.new_primary_affinity
[0] = 0x8000;
240 osdmap
.apply_incremental(new_pool_inc
);
// Mon-facing view after the change: CRUSH_V2 should be gone because no
// pool references the EC rule any more.
243 features
= osdmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
);
244 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES
);
245 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES2
);
246 ASSERT_TRUE(features
& CEPH_FEATURE_CRUSH_TUNABLES3
); // shared bit with primary affinity
247 ASSERT_FALSE(features
& CEPH_FEATURE_CRUSH_V2
);
248 ASSERT_TRUE(features
& CEPH_FEATURE_OSDHASHPSPOOL
);
249 ASSERT_TRUE(features
& CEPH_FEATURE_OSD_PRIMARY_AFFINITY
);
251 // FIXME: test tiering feature bits
254 TEST_F(OSDMapTest
, MapPG
) {
257 std::cerr
<< " osdmap.pool_max==" << osdmap
.get_pool_max() << std::endl
;
258 pg_t
rawpg(0, my_rep_pool
);
259 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
260 vector
<int> up_osds
, acting_osds
;
261 int up_primary
, acting_primary
;
263 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
264 &acting_osds
, &acting_primary
);
266 vector
<int> old_up_osds
, old_acting_osds
;
267 osdmap
.pg_to_up_acting_osds(pgid
, old_up_osds
, old_acting_osds
);
268 ASSERT_EQ(old_up_osds
, up_osds
);
269 ASSERT_EQ(old_acting_osds
, acting_osds
);
271 ASSERT_EQ(osdmap
.get_pg_pool(my_rep_pool
)->get_size(), up_osds
.size());
274 TEST_F(OSDMapTest
, MapFunctionsMatch
) {
275 // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match
277 pg_t
rawpg(0, my_rep_pool
);
278 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
279 vector
<int> up_osds
, acting_osds
;
280 int up_primary
, acting_primary
;
282 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
283 &acting_osds
, &acting_primary
);
285 vector
<int> up_osds_two
, acting_osds_two
;
287 osdmap
.pg_to_up_acting_osds(pgid
, up_osds_two
, acting_osds_two
);
289 ASSERT_EQ(up_osds
, up_osds_two
);
290 ASSERT_EQ(acting_osds
, acting_osds_two
);
292 int acting_primary_two
;
293 osdmap
.pg_to_acting_osds(pgid
, &acting_osds_two
, &acting_primary_two
);
294 EXPECT_EQ(acting_osds
, acting_osds_two
);
295 EXPECT_EQ(acting_primary
, acting_primary_two
);
296 osdmap
.pg_to_acting_osds(pgid
, acting_osds_two
);
297 EXPECT_EQ(acting_osds
, acting_osds_two
);
300 /** This test must be removed or modified appropriately when we allow
301 * other ways to specify a primary. */
302 TEST_F(OSDMapTest
, PrimaryIsFirst
) {
305 pg_t
rawpg(0, my_rep_pool
);
306 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
307 vector
<int> up_osds
, acting_osds
;
308 int up_primary
, acting_primary
;
310 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
311 &acting_osds
, &acting_primary
);
312 EXPECT_EQ(up_osds
[0], up_primary
);
313 EXPECT_EQ(acting_osds
[0], acting_primary
);
316 TEST_F(OSDMapTest
, PGTempRespected
) {
319 pg_t
rawpg(0, my_rep_pool
);
320 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
321 vector
<int> up_osds
, acting_osds
;
322 int up_primary
, acting_primary
;
324 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
325 &acting_osds
, &acting_primary
);
327 // copy and swap first and last element in acting_osds
328 vector
<int> new_acting_osds(acting_osds
);
329 int first
= new_acting_osds
[0];
330 new_acting_osds
[0] = *new_acting_osds
.rbegin();
331 *new_acting_osds
.rbegin() = first
;
333 // apply pg_temp to osdmap
334 OSDMap::Incremental
pgtemp_map(osdmap
.get_epoch() + 1);
335 pgtemp_map
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
336 new_acting_osds
.begin(), new_acting_osds
.end());
337 osdmap
.apply_incremental(pgtemp_map
);
339 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
340 &acting_osds
, &acting_primary
);
341 EXPECT_EQ(new_acting_osds
, acting_osds
);
344 TEST_F(OSDMapTest
, PrimaryTempRespected
) {
347 pg_t
rawpg(0, my_rep_pool
);
348 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
350 vector
<int> acting_osds
;
351 int up_primary
, acting_primary
;
353 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
354 &acting_osds
, &acting_primary
);
356 // make second OSD primary via incremental
357 OSDMap::Incremental
pgtemp_map(osdmap
.get_epoch() + 1);
358 pgtemp_map
.new_primary_temp
[pgid
] = acting_osds
[1];
359 osdmap
.apply_incremental(pgtemp_map
);
361 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
362 &acting_osds
, &acting_primary
);
363 EXPECT_EQ(acting_primary
, acting_osds
[1]);
366 TEST_F(OSDMapTest
, CleanTemps
) {
369 OSDMap::Incremental
pgtemp_map(osdmap
.get_epoch() + 1);
370 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 2);
371 pg_t pga
= osdmap
.raw_pg_to_pg(pg_t(0, my_rep_pool
));
373 vector
<int> up_osds
, acting_osds
;
374 int up_primary
, acting_primary
;
375 osdmap
.pg_to_up_acting_osds(pga
, &up_osds
, &up_primary
,
376 &acting_osds
, &acting_primary
);
377 pgtemp_map
.new_pg_temp
[pga
] = mempool::osdmap::vector
<int>(
378 up_osds
.begin(), up_osds
.end());
379 pgtemp_map
.new_primary_temp
[pga
] = up_primary
;
381 pg_t pgb
= osdmap
.raw_pg_to_pg(pg_t(1, my_rep_pool
));
383 vector
<int> up_osds
, acting_osds
;
384 int up_primary
, acting_primary
;
385 osdmap
.pg_to_up_acting_osds(pgb
, &up_osds
, &up_primary
,
386 &acting_osds
, &acting_primary
);
387 pending_inc
.new_pg_temp
[pgb
] = mempool::osdmap::vector
<int>(
388 up_osds
.begin(), up_osds
.end());
389 pending_inc
.new_primary_temp
[pgb
] = up_primary
;
392 osdmap
.apply_incremental(pgtemp_map
);
395 tmpmap
.deepish_copy_from(osdmap
);
396 tmpmap
.apply_incremental(pending_inc
);
397 OSDMap::clean_temps(g_ceph_context
, osdmap
, tmpmap
, &pending_inc
);
399 EXPECT_TRUE(pending_inc
.new_pg_temp
.count(pga
) &&
400 pending_inc
.new_pg_temp
[pga
].size() == 0);
401 EXPECT_EQ(-1, pending_inc
.new_primary_temp
[pga
]);
403 EXPECT_TRUE(!pending_inc
.new_pg_temp
.count(pgb
) &&
404 !pending_inc
.new_primary_temp
.count(pgb
));
407 TEST_F(OSDMapTest
, KeepsNecessaryTemps
) {
410 pg_t
rawpg(0, my_rep_pool
);
411 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
412 vector
<int> up_osds
, acting_osds
;
413 int up_primary
, acting_primary
;
415 osdmap
.pg_to_up_acting_osds(pgid
, &up_osds
, &up_primary
,
416 &acting_osds
, &acting_primary
);
418 // find unused OSD and stick it in there
419 OSDMap::Incremental
pgtemp_map(osdmap
.get_epoch() + 1);
420 // find an unused osd and put it in place of the first one
422 for(; i
!= (int)get_num_osds(); ++i
) {
424 for (vector
<int>::iterator osd_it
= up_osds
.begin();
425 osd_it
!= up_osds
.end();
437 if (i
== (int)get_num_osds())
438 FAIL() << "did not find unused OSD for temp mapping";
440 pgtemp_map
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
441 up_osds
.begin(), up_osds
.end());
442 pgtemp_map
.new_primary_temp
[pgid
] = up_osds
[1];
443 osdmap
.apply_incremental(pgtemp_map
);
445 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
448 tmpmap
.deepish_copy_from(osdmap
);
449 tmpmap
.apply_incremental(pending_inc
);
450 OSDMap::clean_temps(g_ceph_context
, osdmap
, tmpmap
, &pending_inc
);
451 EXPECT_FALSE(pending_inc
.new_pg_temp
.count(pgid
));
452 EXPECT_FALSE(pending_inc
.new_primary_temp
.count(pgid
));
455 TEST_F(OSDMapTest
, PrimaryAffinity
) {
458 int n
= get_num_osds();
459 for (map
<int64_t,pg_pool_t
>::const_iterator p
= osdmap
.get_pools().begin();
460 p
!= osdmap
.get_pools().end();
463 int expect_primary
= 10000 / n
;
464 cout
<< "pool " << pool
<< " size " << (int)p
->second
.size
465 << " expect_primary " << expect_primary
<< std::endl
;
467 vector
<int> any(n
, 0);
468 vector
<int> first(n
, 0);
469 vector
<int> primary(n
, 0);
470 test_mappings(pool
, 10000, &any
, &first
, &primary
);
471 for (int i
=0; i
<n
; ++i
) {
472 ASSERT_LT(0, any
[i
]);
473 ASSERT_LT(0, first
[i
]);
474 ASSERT_LT(0, primary
[i
]);
478 osdmap
.set_primary_affinity(0, 0);
479 osdmap
.set_primary_affinity(1, 0);
481 vector
<int> any(n
, 0);
482 vector
<int> first(n
, 0);
483 vector
<int> primary(n
, 0);
484 test_mappings(pool
, 10000, &any
, &first
, &primary
);
485 for (int i
=0; i
<n
; ++i
) {
486 ASSERT_LT(0, any
[i
]);
488 ASSERT_LT(0, first
[i
]);
489 ASSERT_LT(0, primary
[i
]);
491 if (p
->second
.is_replicated()) {
492 ASSERT_EQ(0, first
[i
]);
494 ASSERT_EQ(0, primary
[i
]);
499 osdmap
.set_primary_affinity(0, 0x8000);
500 osdmap
.set_primary_affinity(1, 0);
502 vector
<int> any(n
, 0);
503 vector
<int> first(n
, 0);
504 vector
<int> primary(n
, 0);
505 test_mappings(pool
, 10000, &any
, &first
, &primary
);
506 int expect
= (10000 / (n
-2)) / 2; // half weight
507 cout
<< "expect " << expect
<< std::endl
;
508 for (int i
=0; i
<n
; ++i
) {
509 ASSERT_LT(0, any
[i
]);
511 ASSERT_LT(0, first
[i
]);
512 ASSERT_LT(0, primary
[i
]);
514 if (p
->second
.is_replicated()) {
515 ASSERT_EQ(0, first
[i
]);
517 ASSERT_EQ(0, primary
[i
]);
519 ASSERT_LT(expect
*2/3, primary
[0]);
520 ASSERT_GT(expect
*4/3, primary
[0]);
525 osdmap
.set_primary_affinity(0, 0x10000);
526 osdmap
.set_primary_affinity(1, 0x10000);
530 TEST_F(OSDMapTest
, get_osd_crush_node_flags
) {
533 for (unsigned i
=0; i
<get_num_osds(); ++i
) {
534 ASSERT_EQ(0u, osdmap
.get_osd_crush_node_flags(i
));
537 OSDMap::Incremental
inc(osdmap
.get_epoch() + 1);
538 inc
.new_crush_node_flags
[-1] = 123u;
539 osdmap
.apply_incremental(inc
);
540 for (unsigned i
=0; i
<get_num_osds(); ++i
) {
541 ASSERT_EQ(123u, osdmap
.get_osd_crush_node_flags(i
));
543 ASSERT_EQ(0u, osdmap
.get_osd_crush_node_flags(1000));
545 OSDMap::Incremental
inc3(osdmap
.get_epoch() + 1);
546 inc3
.new_crush_node_flags
[-1] = 456u;
547 osdmap
.apply_incremental(inc3
);
548 for (unsigned i
=0; i
<get_num_osds(); ++i
) {
549 ASSERT_EQ(456u, osdmap
.get_osd_crush_node_flags(i
));
551 ASSERT_EQ(0u, osdmap
.get_osd_crush_node_flags(1000));
553 OSDMap::Incremental
inc2(osdmap
.get_epoch() + 1);
554 inc2
.new_crush_node_flags
[-1] = 0;
555 osdmap
.apply_incremental(inc2
);
556 for (unsigned i
=0; i
<get_num_osds(); ++i
) {
557 ASSERT_EQ(0u, osdmap
.get_crush_node_flags(i
));
561 TEST_F(OSDMapTest
, parse_osd_id_list
) {
565 osdmap
.get_all_osds(all
);
567 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"osd.0"}, &out
, &cout
));
568 ASSERT_EQ(1u, out
.size());
569 ASSERT_EQ(0, *out
.begin());
571 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"1"}, &out
, &cout
));
572 ASSERT_EQ(1u, out
.size());
573 ASSERT_EQ(1, *out
.begin());
575 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"osd.0","osd.1"}, &out
, &cout
));
576 ASSERT_EQ(2u, out
.size());
577 ASSERT_EQ(0, *out
.begin());
578 ASSERT_EQ(1, *out
.rbegin());
580 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"osd.0","1"}, &out
, &cout
));
581 ASSERT_EQ(2u, out
.size());
582 ASSERT_EQ(0, *out
.begin());
583 ASSERT_EQ(1, *out
.rbegin());
585 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"*"}, &out
, &cout
));
586 ASSERT_EQ(all
.size(), out
.size());
589 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"all"}, &out
, &cout
));
592 ASSERT_EQ(0, osdmap
.parse_osd_id_list({"any"}, &out
, &cout
));
595 ASSERT_EQ(-EINVAL
, osdmap
.parse_osd_id_list({"foo"}, &out
, &cout
));
596 ASSERT_EQ(-EINVAL
, osdmap
.parse_osd_id_list({"-12"}, &out
, &cout
));
599 TEST_F(OSDMapTest
, CleanPGUpmaps
) {
602 // build a crush rule of type host
603 const int expected_host_num
= 3;
604 int osd_per_host
= get_num_osds() / expected_host_num
;
605 ASSERT_GE(2, osd_per_host
);
607 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
608 if (i
&& i
% osd_per_host
== 0) {
611 stringstream osd_name
;
612 stringstream host_name
;
613 vector
<string
> move_to
;
614 osd_name
<< "osd." << i
;
615 host_name
<< "host-" << index
;
616 move_to
.push_back("root=default");
617 string host_loc
= "host=" + host_name
.str();
618 move_to
.push_back(host_loc
);
619 int r
= crush_move(osdmap
, osd_name
.str(), move_to
);
622 const string upmap_rule
= "upmap";
623 int upmap_rule_no
= crush_rule_create_replicated(
624 upmap_rule
, "default", "host");
625 ASSERT_LT(0, upmap_rule_no
);
627 // create a replicated pool which references the above rule
628 OSDMap::Incremental
new_pool_inc(osdmap
.get_epoch() + 1);
629 new_pool_inc
.new_pool_max
= osdmap
.get_pool_max();
630 new_pool_inc
.fsid
= osdmap
.get_fsid();
632 uint64_t upmap_pool_id
= ++new_pool_inc
.new_pool_max
;
633 pg_pool_t
*p
= new_pool_inc
.get_new_pool(upmap_pool_id
, &empty
);
637 p
->type
= pg_pool_t::TYPE_REPLICATED
;
638 p
->crush_rule
= upmap_rule_no
;
639 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
640 new_pool_inc
.new_pool_names
[upmap_pool_id
] = "upmap_pool";
641 osdmap
.apply_incremental(new_pool_inc
);
643 pg_t
rawpg(0, upmap_pool_id
);
644 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
647 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
648 ASSERT_LT(1U, up
.size());
650 // validate we won't have two OSDs from a same host
651 int parent_0
= osdmap
.crush
->get_parent_of_type(up
[0],
652 osdmap
.crush
->get_type_id("host"));
653 int parent_1
= osdmap
.crush
->get_parent_of_type(up
[1],
654 osdmap
.crush
->get_type_id("host"));
655 ASSERT_TRUE(parent_0
!= parent_1
);
659 // cancel stale upmaps
660 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
662 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
663 if (std::find(up
.begin(), up
.end(), i
) == up
.end()) {
668 ASSERT_TRUE(from
>= 0);
670 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
671 if (std::find(up
.begin(), up
.end(), i
) == up
.end() && i
!= from
) {
676 ASSERT_TRUE(to
>= 0);
677 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
678 new_pg_upmap_items
.push_back(make_pair(from
, to
));
679 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
680 pending_inc
.new_pg_upmap_items
[pgid
] =
681 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
682 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
684 nextmap
.deepish_copy_from(osdmap
);
685 nextmap
.apply_incremental(pending_inc
);
686 ASSERT_TRUE(nextmap
.have_pg_upmaps(pgid
));
687 OSDMap::Incremental
new_pending_inc(nextmap
.get_epoch() + 1);
688 clean_pg_upmaps(g_ceph_context
, nextmap
, new_pending_inc
);
689 nextmap
.apply_incremental(new_pending_inc
);
690 ASSERT_TRUE(!nextmap
.have_pg_upmaps(pgid
));
694 // https://tracker.ceph.com/issues/37493
695 pg_t
ec_pg(0, my_ec_pool
);
696 pg_t ec_pgid
= osdmap
.raw_pg_to_pg(ec_pg
);
697 OSDMap tmpmap
; // use a tmpmap here, so we do not dirty origin map..
701 // insert a valid pg_upmap_item
704 osdmap
.pg_to_raw_up(ec_pgid
, &ec_up
, &ec_up_primary
);
705 ASSERT_TRUE(!ec_up
.empty());
706 from
= *(ec_up
.begin());
707 ASSERT_TRUE(from
>= 0);
708 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
709 if (std::find(ec_up
.begin(), ec_up
.end(), i
) == ec_up
.end()) {
714 ASSERT_TRUE(to
>= 0);
715 ASSERT_TRUE(from
!= to
);
716 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
717 new_pg_upmap_items
.push_back(make_pair(from
, to
));
718 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
719 pending_inc
.new_pg_upmap_items
[ec_pgid
] =
720 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
721 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
722 tmpmap
.deepish_copy_from(osdmap
);
723 tmpmap
.apply_incremental(pending_inc
);
724 ASSERT_TRUE(tmpmap
.have_pg_upmaps(ec_pgid
));
727 // mark one of the target OSDs of the above pg_upmap_item as down
728 OSDMap::Incremental
pending_inc(tmpmap
.get_epoch() + 1);
729 pending_inc
.new_state
[to
] = CEPH_OSD_UP
;
730 tmpmap
.apply_incremental(pending_inc
);
731 ASSERT_TRUE(!tmpmap
.is_up(to
));
732 ASSERT_TRUE(tmpmap
.have_pg_upmaps(ec_pgid
));
735 // confirm *clean_pg_upmaps* won't do anything bad
736 OSDMap::Incremental
pending_inc(tmpmap
.get_epoch() + 1);
737 clean_pg_upmaps(g_ceph_context
, tmpmap
, pending_inc
);
738 tmpmap
.apply_incremental(pending_inc
);
739 ASSERT_TRUE(tmpmap
.have_pg_upmaps(ec_pgid
));
744 // http://tracker.ceph.com/issues/37501
745 pg_t
ec_pg(0, my_ec_pool
);
746 pg_t ec_pgid
= osdmap
.raw_pg_to_pg(ec_pg
);
747 OSDMap tmpmap
; // use a tmpmap here, so we do not dirty origin map..
751 // insert a valid pg_upmap_item
754 osdmap
.pg_to_raw_up(ec_pgid
, &ec_up
, &ec_up_primary
);
755 ASSERT_TRUE(!ec_up
.empty());
756 from
= *(ec_up
.begin());
757 ASSERT_TRUE(from
>= 0);
758 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
759 if (std::find(ec_up
.begin(), ec_up
.end(), i
) == ec_up
.end()) {
764 ASSERT_TRUE(to
>= 0);
765 ASSERT_TRUE(from
!= to
);
766 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
767 new_pg_upmap_items
.push_back(make_pair(from
, to
));
768 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
769 pending_inc
.new_pg_upmap_items
[ec_pgid
] =
770 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
771 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
772 tmpmap
.deepish_copy_from(osdmap
);
773 tmpmap
.apply_incremental(pending_inc
);
774 ASSERT_TRUE(tmpmap
.have_pg_upmaps(ec_pgid
));
777 // mark one of the target OSDs of the above pg_upmap_item as out
778 OSDMap::Incremental
pending_inc(tmpmap
.get_epoch() + 1);
779 pending_inc
.new_weight
[to
] = CEPH_OSD_OUT
;
780 tmpmap
.apply_incremental(pending_inc
);
781 ASSERT_TRUE(tmpmap
.is_out(to
));
782 ASSERT_TRUE(tmpmap
.have_pg_upmaps(ec_pgid
));
785 // *clean_pg_upmaps* should be able to remove the above *bad* mapping
786 OSDMap::Incremental
pending_inc(tmpmap
.get_epoch() + 1);
787 clean_pg_upmaps(g_ceph_context
, tmpmap
, pending_inc
);
788 tmpmap
.apply_incremental(pending_inc
);
789 ASSERT_TRUE(!tmpmap
.have_pg_upmaps(ec_pgid
));
794 // http://tracker.ceph.com/issues/37968
796 // build a temporary crush topology of 2 hosts, 3 osds per host
797 OSDMap tmp
; // use a tmpmap here, so we do not dirty origin map..
798 tmp
.deepish_copy_from(osdmap
);
799 const int expected_host_num
= 2;
800 int osd_per_host
= get_num_osds() / expected_host_num
;
801 ASSERT_GE(osd_per_host
, 3);
803 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
804 if (i
&& i
% osd_per_host
== 0) {
807 stringstream osd_name
;
808 stringstream host_name
;
809 vector
<string
> move_to
;
810 osd_name
<< "osd." << i
;
811 host_name
<< "host-" << index
;
812 move_to
.push_back("root=default");
813 string host_loc
= "host=" + host_name
.str();
814 move_to
.push_back(host_loc
);
815 auto r
= crush_move(tmp
, osd_name
.str(), move_to
);
821 get_crush(tmp
, crush
);
822 string rule_name
= "rule_37968";
823 int rule_type
= pg_pool_t::TYPE_ERASURE
;
824 ASSERT_TRUE(!crush
.rule_exists(rule_name
));
826 for (rno
= 0; rno
< crush
.get_max_rules(); rno
++) {
827 if (!crush
.rule_exists(rno
) && !crush
.ruleset_exists(rno
))
830 string root_name
= "default";
831 int root
= crush
.get_item_id(root_name
);
835 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_size
, max_size
);
837 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
838 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
839 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, root
, 0);
840 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSE_INDEP
, 2, 1 /* host*/);
841 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSE_INDEP
, 2, 0 /* osd */);
842 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
843 ASSERT_TRUE(step
== steps
);
844 auto r
= crush_add_rule(crush
.get_crush_map(), rule
, rno
);
846 crush
.set_rule_name(rno
, rule_name
);
848 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
849 pending_inc
.crush
.clear();
850 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
851 tmp
.apply_incremental(pending_inc
);
854 // create a erasuce-coded pool referencing the above rule
857 OSDMap::Incremental
new_pool_inc(tmp
.get_epoch() + 1);
858 new_pool_inc
.new_pool_max
= tmp
.get_pool_max();
859 new_pool_inc
.fsid
= tmp
.get_fsid();
861 pool_37968
= ++new_pool_inc
.new_pool_max
;
862 pg_pool_t
*p
= new_pool_inc
.get_new_pool(pool_37968
, &empty
);
866 p
->type
= pg_pool_t::TYPE_ERASURE
;
868 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
869 new_pool_inc
.new_pool_names
[pool_37968
] = "pool_37968";
870 tmp
.apply_incremental(new_pool_inc
);
873 pg_t
ec_pg(0, pool_37968
);
874 pg_t ec_pgid
= tmp
.raw_pg_to_pg(ec_pg
);
878 // insert a valid pg_upmap_item
881 tmp
.pg_to_raw_up(ec_pgid
, &ec_up
, &ec_up_primary
);
882 ASSERT_TRUE(ec_up
.size() == 4);
883 from
= *(ec_up
.begin());
884 ASSERT_TRUE(from
>= 0);
885 auto parent
= tmp
.crush
->get_parent_of_type(from
, 1 /* host */, rno
);
886 ASSERT_TRUE(parent
< 0);
887 // pick an osd of the same parent with *from*
888 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
889 if (std::find(ec_up
.begin(), ec_up
.end(), i
) == ec_up
.end()) {
890 auto p
= tmp
.crush
->get_parent_of_type(i
, 1 /* host */, rno
);
897 ASSERT_TRUE(to
>= 0);
898 ASSERT_TRUE(from
!= to
);
899 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
900 new_pg_upmap_items
.push_back(make_pair(from
, to
));
901 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
902 pending_inc
.new_pg_upmap_items
[ec_pgid
] =
903 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
904 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
905 tmp
.apply_incremental(pending_inc
);
906 ASSERT_TRUE(tmp
.have_pg_upmaps(ec_pgid
));
909 // *clean_pg_upmaps* should not remove the above upmap_item
910 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
911 clean_pg_upmaps(g_ceph_context
, tmp
, pending_inc
);
912 tmp
.apply_incremental(pending_inc
);
913 ASSERT_TRUE(tmp
.have_pg_upmaps(ec_pgid
));
920 // STEP-1: enumerate all children of up[0]'s parent,
921 // replace up[1] with one of them (other than up[0])
922 int parent
= osdmap
.crush
->get_parent_of_type(up
[0],
923 osdmap
.crush
->get_type_id("host"));
925 osdmap
.crush
->get_leaves(osdmap
.crush
->get_item_name(parent
), &candidates
);
926 ASSERT_LT(1U, candidates
.size());
927 int replaced_by
= -1;
928 for (auto c
: candidates
) {
935 // Check we can handle a negative pg_upmap value
936 vector
<int32_t> new_pg_upmap
;
937 new_pg_upmap
.push_back(up
[0]);
938 new_pg_upmap
.push_back(-823648512);
939 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
940 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
941 new_pg_upmap
.begin(), new_pg_upmap
.end());
942 osdmap
.apply_incremental(pending_inc
);
945 // crucial call - _apply_upmap should ignore the negative value
946 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
948 ASSERT_NE(-1, replaced_by
);
949 // generate a new pg_upmap item and apply
950 vector
<int32_t> new_pg_upmap
;
951 new_pg_upmap
.push_back(up
[0]);
952 new_pg_upmap
.push_back(replaced_by
); // up[1] -> replaced_by
953 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
954 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
955 new_pg_upmap
.begin(), new_pg_upmap
.end());
956 osdmap
.apply_incremental(pending_inc
);
958 // validate pg_upmap is there
961 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
962 ASSERT_EQ(new_up
.size(), up
.size());
963 ASSERT_EQ(new_up
[0], new_pg_upmap
[0]);
964 ASSERT_EQ(new_up
[1], new_pg_upmap
[1]);
965 // and we shall have two OSDs from a same host now..
966 int parent_0
= osdmap
.crush
->get_parent_of_type(new_up
[0],
967 osdmap
.crush
->get_type_id("host"));
968 int parent_1
= osdmap
.crush
->get_parent_of_type(new_up
[1],
969 osdmap
.crush
->get_type_id("host"));
970 ASSERT_EQ(parent_0
, parent_1
);
974 // STEP-2: apply cure
975 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
976 clean_pg_upmaps(g_ceph_context
, osdmap
, pending_inc
);
977 osdmap
.apply_incremental(pending_inc
);
979 // validate pg_upmap is gone (reverted)
982 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
983 ASSERT_EQ(new_up
, up
);
984 ASSERT_EQ(new_up_primary
, up_primary
);
990 // TEST pg_upmap_items
991 // enumerate all used hosts first
994 int parent
= osdmap
.crush
->get_parent_of_type(u
,
995 osdmap
.crush
->get_type_id("host"));
996 ASSERT_GT(0, parent
);
997 parents
.insert(parent
);
999 int candidate_parent
= 0;
1000 set
<int> candidate_children
;
1001 vector
<int> up_after_out
;
1003 // STEP-1: try mark out up[1] and all other OSDs from the same host
1004 int parent
= osdmap
.crush
->get_parent_of_type(up
[1],
1005 osdmap
.crush
->get_type_id("host"));
1007 osdmap
.crush
->get_leaves(osdmap
.crush
->get_item_name(parent
),
1009 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1010 for (auto c
: children
) {
1011 pending_inc
.new_weight
[c
] = CEPH_OSD_OUT
;
1014 tmpmap
.deepish_copy_from(osdmap
);
1015 tmpmap
.apply_incremental(pending_inc
);
1018 tmpmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
1019 // verify that we'll have OSDs from a different host..
1020 int will_choose
= -1;
1021 for (auto o
: new_up
) {
1022 int parent
= tmpmap
.crush
->get_parent_of_type(o
,
1023 osdmap
.crush
->get_type_id("host"));
1024 if (!parents
.count(parent
)) {
1026 candidate_parent
= parent
; // record
1030 ASSERT_LT(-1, will_choose
); // it is an OSD!
1031 ASSERT_NE(candidate_parent
, 0);
1032 osdmap
.crush
->get_leaves(osdmap
.crush
->get_item_name(candidate_parent
),
1033 &candidate_children
);
1034 ASSERT_TRUE(candidate_children
.count(will_choose
));
1035 candidate_children
.erase(will_choose
);
1036 ASSERT_FALSE(candidate_children
.empty());
1037 up_after_out
= new_up
; // needed for verification..
1040 // Make sure we can handle a negative pg_upmap_item
1042 int replaced_by
= -823648512;
1043 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1044 new_pg_upmap_items
.push_back(make_pair(victim
, replaced_by
));
1046 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1047 pending_inc
.new_pg_upmap_items
[pgid
] =
1048 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1049 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1050 osdmap
.apply_incremental(pending_inc
);
1053 // crucial call - _apply_upmap should ignore the negative value
1054 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
1057 // STEP-2: generating a new pg_upmap_items entry by
1058 // replacing up[0] with one coming from candidate_children
1060 int replaced_by
= *candidate_children
.begin();
1061 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1062 new_pg_upmap_items
.push_back(make_pair(victim
, replaced_by
));
1064 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1065 pending_inc
.new_pg_upmap_items
[pgid
] =
1066 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1067 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1068 osdmap
.apply_incremental(pending_inc
);
1070 // validate pg_upmap_items is there
1073 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
1074 ASSERT_EQ(new_up
.size(), up
.size());
1075 ASSERT_TRUE(std::find(new_up
.begin(), new_up
.end(), replaced_by
) !=
1078 ASSERT_TRUE(std::find(new_up
.begin(), new_up
.end(), up
[1]) !=
1083 // STEP-3: mark out up[1] and all other OSDs from the same host
1084 int parent
= osdmap
.crush
->get_parent_of_type(up
[1],
1085 osdmap
.crush
->get_type_id("host"));
1087 osdmap
.crush
->get_leaves(osdmap
.crush
->get_item_name(parent
),
1089 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1090 for (auto c
: children
) {
1091 pending_inc
.new_weight
[c
] = CEPH_OSD_OUT
;
1093 osdmap
.apply_incremental(pending_inc
);
1095 // validate we have two OSDs from the same host now..
1098 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
1099 ASSERT_EQ(up
.size(), new_up
.size());
1100 int parent_0
= osdmap
.crush
->get_parent_of_type(new_up
[0],
1101 osdmap
.crush
->get_type_id("host"));
1102 int parent_1
= osdmap
.crush
->get_parent_of_type(new_up
[1],
1103 osdmap
.crush
->get_type_id("host"));
1104 ASSERT_EQ(parent_0
, parent_1
);
1108 // STEP-4: apply cure
1109 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1110 clean_pg_upmaps(g_ceph_context
, osdmap
, pending_inc
);
1111 osdmap
.apply_incremental(pending_inc
);
1113 // validate pg_upmap_items is gone (reverted)
1116 osdmap
.pg_to_raw_up(pgid
, &new_up
, &new_up_primary
);
1117 ASSERT_EQ(new_up
, up_after_out
);
1123 TEST_F(OSDMapTest
, BUG_38897
) {
1124 // http://tracker.ceph.com/issues/38897
1125 // build a fresh map with 12 OSDs, without any default pools
1126 set_up_map(12, true);
1127 const string
pool_1("pool1");
1128 const string
pool_2("pool2");
1129 int64_t pool_1_id
= -1;
1132 // build customized crush rule for "pool1"
1133 string host_name
= "host_for_pool_1";
1134 // build a customized host to capture osd.1~5
1135 for (int i
= 1; i
< 5; i
++) {
1136 stringstream osd_name
;
1137 vector
<string
> move_to
;
1138 osd_name
<< "osd." << i
;
1139 move_to
.push_back("root=default");
1140 string host_loc
= "host=" + host_name
;
1141 move_to
.push_back(host_loc
);
1142 auto r
= crush_move(osdmap
, osd_name
.str(), move_to
);
1146 get_crush(osdmap
, crush
);
1147 auto host_id
= crush
.get_item_id(host_name
);
1148 ASSERT_TRUE(host_id
< 0);
1149 string rule_name
= "rule_for_pool1";
1150 int rule_type
= pg_pool_t::TYPE_REPLICATED
;
1151 ASSERT_TRUE(!crush
.rule_exists(rule_name
));
1153 for (rno
= 0; rno
< crush
.get_max_rules(); rno
++) {
1154 if (!crush
.rule_exists(rno
) && !crush
.ruleset_exists(rno
))
1160 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_size
, max_size
);
1162 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1163 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1164 // always choose osd.0
1165 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, 0, 0);
1166 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1167 // then pick any other random osds
1168 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, host_id
, 0);
1169 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSELEAF_FIRSTN
, 2, 0);
1170 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1171 ASSERT_TRUE(step
== steps
);
1172 auto r
= crush_add_rule(crush
.get_crush_map(), rule
, rno
);
1173 ASSERT_TRUE(r
>= 0);
1174 crush
.set_rule_name(rno
, rule_name
);
1176 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1177 pending_inc
.crush
.clear();
1178 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1179 osdmap
.apply_incremental(pending_inc
);
1183 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1184 pending_inc
.new_pool_max
= osdmap
.get_pool_max();
1185 auto pool_id
= ++pending_inc
.new_pool_max
;
1186 pool_1_id
= pool_id
;
1188 auto p
= pending_inc
.get_new_pool(pool_id
, &empty
);
1193 p
->type
= pg_pool_t::TYPE_REPLICATED
;
1194 p
->crush_rule
= rno
;
1195 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
1196 pending_inc
.new_pool_names
[pool_id
] = pool_1
;
1197 osdmap
.apply_incremental(pending_inc
);
1198 ASSERT_TRUE(osdmap
.have_pg_pool(pool_id
));
1199 ASSERT_TRUE(osdmap
.get_pool_name(pool_id
) == pool_1
);
1201 for (unsigned i
= 0; i
< 3; i
++) {
1203 pg_t
rawpg(i
, pool_id
);
1204 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
1207 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
1208 ASSERT_TRUE(up
.size() == 3);
1209 ASSERT_TRUE(up
[0] == 0);
1211 // insert a new pg_upmap
1212 vector
<int32_t> new_up
;
1213 // and remap 1.x to osd.1 only
1214 // this way osd.0 is deemed to be *underfull*
1215 // and osd.1 is deemed to be *overfull*
1216 new_up
.push_back(1);
1218 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1219 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
1220 new_up
.begin(), new_up
.end());
1221 osdmap
.apply_incremental(pending_inc
);
1223 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
1224 ASSERT_TRUE(up
.size() == 1);
1225 ASSERT_TRUE(up
[0] == 1);
1231 // build customized crush rule for "pool2"
1232 string host_name
= "host_for_pool_2";
1233 // build a customized host to capture osd.6~11
1234 for (int i
= 6; i
< (int)get_num_osds(); i
++) {
1235 stringstream osd_name
;
1236 vector
<string
> move_to
;
1237 osd_name
<< "osd." << i
;
1238 move_to
.push_back("root=default");
1239 string host_loc
= "host=" + host_name
;
1240 move_to
.push_back(host_loc
);
1241 auto r
= crush_move(osdmap
, osd_name
.str(), move_to
);
1245 get_crush(osdmap
, crush
);
1246 auto host_id
= crush
.get_item_id(host_name
);
1247 ASSERT_TRUE(host_id
< 0);
1248 string rule_name
= "rule_for_pool2";
1249 int rule_type
= pg_pool_t::TYPE_REPLICATED
;
1250 ASSERT_TRUE(!crush
.rule_exists(rule_name
));
1252 for (rno
= 0; rno
< crush
.get_max_rules(); rno
++) {
1253 if (!crush
.rule_exists(rno
) && !crush
.ruleset_exists(rno
))
1259 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_size
, max_size
);
1261 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1262 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1263 // always choose osd.0
1264 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, 0, 0);
1265 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1266 // then pick any other random osds
1267 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, host_id
, 0);
1268 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSELEAF_FIRSTN
, 2, 0);
1269 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1270 ASSERT_TRUE(step
== steps
);
1271 auto r
= crush_add_rule(crush
.get_crush_map(), rule
, rno
);
1272 ASSERT_TRUE(r
>= 0);
1273 crush
.set_rule_name(rno
, rule_name
);
1275 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1276 pending_inc
.crush
.clear();
1277 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1278 osdmap
.apply_incremental(pending_inc
);
1282 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1283 pending_inc
.new_pool_max
= osdmap
.get_pool_max();
1284 auto pool_id
= ++pending_inc
.new_pool_max
;
1286 auto p
= pending_inc
.get_new_pool(pool_id
, &empty
);
1288 // include a single PG
1291 p
->type
= pg_pool_t::TYPE_REPLICATED
;
1292 p
->crush_rule
= rno
;
1293 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
1294 pending_inc
.new_pool_names
[pool_id
] = pool_2
;
1295 osdmap
.apply_incremental(pending_inc
);
1296 ASSERT_TRUE(osdmap
.have_pg_pool(pool_id
));
1297 ASSERT_TRUE(osdmap
.get_pool_name(pool_id
) == pool_2
);
1298 pg_t
rawpg(0, pool_id
);
1299 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
1300 EXPECT_TRUE(!osdmap
.have_pg_upmaps(pgid
));
1303 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
1304 ASSERT_TRUE(up
.size() == 3);
1305 ASSERT_TRUE(up
[0] == 0);
1308 // build a pg_upmap_item that will
1309 // remap pg out from *underfull* osd.0
1310 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1311 new_pg_upmap_items
.push_back(make_pair(0, 10)); // osd.0 -> osd.10
1312 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1313 pending_inc
.new_pg_upmap_items
[pgid
] =
1314 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1315 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1316 osdmap
.apply_incremental(pending_inc
);
1317 ASSERT_TRUE(osdmap
.have_pg_upmaps(pgid
));
1320 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
1321 ASSERT_TRUE(up
.size() == 3);
1322 ASSERT_TRUE(up
[0] == 10);
1328 set
<int64_t> only_pools
;
1329 ASSERT_TRUE(pool_1_id
>= 0);
1330 only_pools
.insert(pool_1_id
);
1331 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1332 // require perfect distribution! (max deviation 0)
1333 osdmap
.calc_pg_upmaps(g_ceph_context
,
1334 0, // so we can force optimizing
1338 osdmap
.apply_incremental(pending_inc
);
1342 TEST_F(OSDMapTest
, BUG_40104
) {
1343 // http://tracker.ceph.com/issues/40104
1344 int big_osd_num
= 5000;
1345 int big_pg_num
= 10000;
1346 set_up_map(big_osd_num
, true);
1349 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1350 pending_inc
.new_pool_max
= osdmap
.get_pool_max();
1351 pool_id
= ++pending_inc
.new_pool_max
;
1353 auto p
= pending_inc
.get_new_pool(pool_id
, &empty
);
1356 p
->set_pg_num(big_pg_num
);
1357 p
->set_pgp_num(big_pg_num
);
1358 p
->type
= pg_pool_t::TYPE_REPLICATED
;
1360 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
1361 pending_inc
.new_pool_names
[pool_id
] = "big_pool";
1362 osdmap
.apply_incremental(pending_inc
);
1363 ASSERT_TRUE(osdmap
.have_pg_pool(pool_id
));
1364 ASSERT_TRUE(osdmap
.get_pool_name(pool_id
) == "big_pool");
1367 // generate pg_upmap_items for each pg
1368 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1369 for (int i
= 0; i
< big_pg_num
; i
++) {
1370 pg_t
rawpg(i
, pool_id
);
1371 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
1374 osdmap
.pg_to_raw_up(pgid
, &up
, &up_primary
);
1375 ASSERT_TRUE(up
.size() == 3);
1377 int replaced_by
= random() % big_osd_num
;
1378 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1379 // note that it might or might not be valid, we don't care
1380 new_pg_upmap_items
.push_back(make_pair(victim
, replaced_by
));
1381 pending_inc
.new_pg_upmap_items
[pgid
] =
1382 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1383 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1385 osdmap
.apply_incremental(pending_inc
);
1388 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1389 auto start
= mono_clock::now();
1390 clean_pg_upmaps(g_ceph_context
, osdmap
, pending_inc
);
1391 auto latency
= mono_clock::now() - start
;
1392 std::cout
<< "clean_pg_upmaps (~" << big_pg_num
1393 << " pg_upmap_items) latency:" << timespan_str(latency
)
1398 TEST_F(OSDMapTest
, BUG_42052
) {
1399 // https://tracker.ceph.com/issues/42052
1400 set_up_map(6, true);
1401 const string
pool_name("pool");
1402 // build customized crush rule for "pool"
1404 get_crush(osdmap
, crush
);
1405 string rule_name
= "rule";
1406 int rule_type
= pg_pool_t::TYPE_REPLICATED
;
1407 ASSERT_TRUE(!crush
.rule_exists(rule_name
));
1409 for (rno
= 0; rno
< crush
.get_max_rules(); rno
++) {
1410 if (!crush
.rule_exists(rno
) && !crush
.ruleset_exists(rno
))
1416 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_size
, max_size
);
1418 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1419 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1420 // always choose osd.0, osd.1, osd.2
1421 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, 0, 0);
1422 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1423 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, 0, 1);
1424 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1425 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, 0, 2);
1426 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1427 ASSERT_TRUE(step
== steps
);
1428 auto r
= crush_add_rule(crush
.get_crush_map(), rule
, rno
);
1429 ASSERT_TRUE(r
>= 0);
1430 crush
.set_rule_name(rno
, rule_name
);
1432 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1433 pending_inc
.crush
.clear();
1434 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1435 osdmap
.apply_incremental(pending_inc
);
1439 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1440 pending_inc
.new_pool_max
= osdmap
.get_pool_max();
1441 auto pool_id
= ++pending_inc
.new_pool_max
;
1443 auto p
= pending_inc
.get_new_pool(pool_id
, &empty
);
1448 p
->type
= pg_pool_t::TYPE_REPLICATED
;
1449 p
->crush_rule
= rno
;
1450 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
1451 pending_inc
.new_pool_names
[pool_id
] = pool_name
;
1452 osdmap
.apply_incremental(pending_inc
);
1453 ASSERT_TRUE(osdmap
.have_pg_pool(pool_id
));
1454 ASSERT_TRUE(osdmap
.get_pool_name(pool_id
) == pool_name
);
1455 pg_t
rawpg(0, pool_id
);
1456 pg_t pgid
= osdmap
.raw_pg_to_pg(rawpg
);
1458 // pg_upmap 1.0 [2,3,5]
1459 vector
<int32_t> new_up
{2,3,5};
1460 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1461 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
1462 new_up
.begin(), new_up
.end());
1463 osdmap
.apply_incremental(pending_inc
);
1466 // pg_upmap_items 1.0 [0,3,4,5]
1467 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1468 new_pg_upmap_items
.push_back(make_pair(0, 3));
1469 new_pg_upmap_items
.push_back(make_pair(4, 5));
1470 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1471 pending_inc
.new_pg_upmap_items
[pgid
] =
1472 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1473 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1474 osdmap
.apply_incremental(pending_inc
);
1477 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1478 clean_pg_upmaps(g_ceph_context
, osdmap
, pending_inc
);
1479 osdmap
.apply_incremental(pending_inc
);
1480 ASSERT_FALSE(osdmap
.have_pg_upmaps(pgid
));
1484 TEST_F(OSDMapTest
, BUG_42485
) {
1487 // build a temporary crush topology of 2datacenters, 3racks per dc,
1488 // 1host per rack, 10osds per host
1489 OSDMap tmp
; // use a tmpmap here, so we do not dirty origin map..
1490 tmp
.deepish_copy_from(osdmap
);
1491 const int expected_host_num
= 6;
1492 int osd_per_host
= (int)get_num_osds() / expected_host_num
;
1493 ASSERT_GE(osd_per_host
, 10);
1494 int host_per_dc
= 3;
1497 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
1498 if (i
&& i
% osd_per_host
== 0) {
1501 if (i
&& i
% (host_per_dc
* osd_per_host
) == 0) {
1504 stringstream osd_name
;
1505 stringstream host_name
;
1506 stringstream rack_name
;
1507 stringstream dc_name
;
1508 vector
<string
> move_to
;
1509 osd_name
<< "osd." << i
;
1510 host_name
<< "host-" << index
;
1511 rack_name
<< "rack-" << index
;
1512 dc_name
<< "dc-" << dc_index
;
1513 move_to
.push_back("root=default");
1514 string dc_loc
= "datacenter=" + dc_name
.str();
1515 move_to
.push_back(dc_loc
);
1516 string rack_loc
= "rack=" + rack_name
.str();
1517 move_to
.push_back(rack_loc
);
1518 string host_loc
= "host=" + host_name
.str();
1519 move_to
.push_back(host_loc
);
1520 auto r
= crush_move(tmp
, osd_name
.str(), move_to
);
1526 get_crush(tmp
, crush
);
1527 string rule_name
= "rule_xeus_993_1";
1528 int rule_type
= pg_pool_t::TYPE_REPLICATED
;
1529 ASSERT_TRUE(!crush
.rule_exists(rule_name
));
1531 for (rno
= 0; rno
< crush
.get_max_rules(); rno
++) {
1532 if (!crush
.rule_exists(rno
) && !crush
.ruleset_exists(rno
))
1535 string root_name
= "default";
1536 string dc_1
= "dc-0";
1537 int dc1
= crush
.get_item_id(dc_1
);
1538 string dc_2
= "dc-1";
1539 int dc2
= crush
.get_item_id(dc_2
);
1543 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_size
, max_size
);
1545 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1546 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1547 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, dc1
, 0);
1548 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSELEAF_FIRSTN
, 2, 3 /* rack */);
1549 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1550 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, dc2
, 0);
1551 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSELEAF_FIRSTN
, 2, 3 /* rack */);
1552 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1553 ASSERT_TRUE(step
== steps
);
1554 auto r
= crush_add_rule(crush
.get_crush_map(), rule
, rno
);
1555 ASSERT_TRUE(r
>= 0);
1556 crush
.set_rule_name(rno
, rule_name
);
1558 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1559 pending_inc
.crush
.clear();
1560 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1561 tmp
.apply_incremental(pending_inc
);
1563 // create a repliacted pool referencing the above rule
1564 int64_t pool_xeus_993
;
1566 OSDMap::Incremental
new_pool_inc(tmp
.get_epoch() + 1);
1567 new_pool_inc
.new_pool_max
= tmp
.get_pool_max();
1568 new_pool_inc
.fsid
= tmp
.get_fsid();
1570 pool_xeus_993
= ++new_pool_inc
.new_pool_max
;
1571 pg_pool_t
*p
= new_pool_inc
.get_new_pool(pool_xeus_993
, &empty
);
1573 p
->set_pg_num(4096);
1574 p
->set_pgp_num(4096);
1575 p
->type
= pg_pool_t::TYPE_REPLICATED
;
1576 p
->crush_rule
= rno
;
1577 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
1578 new_pool_inc
.new_pool_names
[pool_xeus_993
] = "pool_xeus_993";
1579 tmp
.apply_incremental(new_pool_inc
);
1582 pg_t
rep_pg(0, pool_xeus_993
);
1583 pg_t rep_pgid
= tmp
.raw_pg_to_pg(rep_pg
);
1589 tmp
.pg_to_raw_up(rep_pgid
, &rep_up
, &rep_up_primary
);
1590 std::cout
<< "pgid " << rep_up
<< " up " << rep_up
<< std::endl
;
1591 ASSERT_TRUE(rep_up
.size() == 4);
1592 from
= *(rep_up
.begin());
1593 ASSERT_TRUE(from
>= 0);
1594 auto dc_parent
= tmp
.crush
->get_parent_of_type(from
, 8 /* dc */, rno
);
1595 if (dc_parent
== dc1
)
1599 auto rack_parent
= tmp
.crush
->get_parent_of_type(from
, 3 /* rack */, rno
);
1600 ASSERT_TRUE(dc_parent
< 0);
1601 ASSERT_TRUE(rack_parent
< 0);
1602 set
<int> rack_parents
;
1603 for (auto &i
: rep_up
) {
1604 if (i
== from
) continue;
1605 auto rack_parent
= tmp
.crush
->get_parent_of_type(i
, 3 /* rack */, rno
);
1606 rack_parents
.insert(rack_parent
);
1608 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
1609 if (std::find(rep_up
.begin(), rep_up
.end(), i
) == rep_up
.end()) {
1610 auto dc_p
= tmp
.crush
->get_parent_of_type(i
, 8 /* dc */, rno
);
1611 auto rack_p
= tmp
.crush
->get_parent_of_type(i
, 3 /* rack */, rno
);
1612 if (dc_p
== dc_parent
&&
1613 rack_parents
.find(rack_p
) == rack_parents
.end()) {
1619 ASSERT_TRUE(to
>= 0);
1620 ASSERT_TRUE(from
!= to
);
1621 std::cout
<< "from " << from
<< " to " << to
<< std::endl
;
1622 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1623 new_pg_upmap_items
.push_back(make_pair(from
, to
));
1624 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1625 pending_inc
.new_pg_upmap_items
[rep_pgid
] =
1626 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1627 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1628 tmp
.apply_incremental(pending_inc
);
1629 ASSERT_TRUE(tmp
.have_pg_upmaps(rep_pgid
));
1631 pg_t
rep_pg2(2, pool_xeus_993
);
1632 pg_t rep_pgid2
= tmp
.raw_pg_to_pg(rep_pg2
);
1634 pg_t rep_pgid
= rep_pgid2
;
1635 vector
<int> from_osds
{-1, -1};
1638 tmp
.pg_to_raw_up(rep_pgid
, &rep_up
, &rep_up_primary
);
1639 ASSERT_TRUE(rep_up
.size() == 4);
1640 from_osds
[0] = *(rep_up
.begin());
1641 from_osds
[1] = *(rep_up
.rbegin());
1642 std::cout
<< "pgid " << rep_pgid2
<< " up " << rep_up
<< std::endl
;
1643 ASSERT_TRUE(*(from_osds
.begin()) >= 0);
1644 ASSERT_TRUE(*(from_osds
.rbegin()) >= 0);
1645 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1646 for (auto &from
: from_osds
) {
1648 auto dc_parent
= tmp
.crush
->get_parent_of_type(from
, 8 /* dc */, rno
);
1649 if (dc_parent
== dc1
)
1653 auto rack_parent
= tmp
.crush
->get_parent_of_type(from
, 3 /* rack */, rno
);
1654 ASSERT_TRUE(dc_parent
< 0);
1655 ASSERT_TRUE(rack_parent
< 0);
1656 set
<int> rack_parents
;
1657 for (auto &i
: rep_up
) {
1658 if (i
== from
) continue;
1659 auto rack_parent
= tmp
.crush
->get_parent_of_type(i
, 3 /* rack */, rno
);
1660 rack_parents
.insert(rack_parent
);
1662 for (auto &i
: new_pg_upmap_items
) {
1663 auto rack_from
= tmp
.crush
->get_parent_of_type(i
.first
, 3, rno
);
1664 auto rack_to
= tmp
.crush
->get_parent_of_type(i
.second
, 3, rno
);
1665 rack_parents
.insert(rack_from
);
1666 rack_parents
.insert(rack_to
);
1668 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
1669 if (std::find(rep_up
.begin(), rep_up
.end(), i
) == rep_up
.end()) {
1670 auto dc_p
= tmp
.crush
->get_parent_of_type(i
, 8 /* dc */, rno
);
1671 auto rack_p
= tmp
.crush
->get_parent_of_type(i
, 3 /* rack */, rno
);
1672 if (dc_p
== dc_parent
&&
1673 rack_parents
.find(rack_p
) == rack_parents
.end()) {
1679 ASSERT_TRUE(to
>= 0);
1680 ASSERT_TRUE(from
!= to
);
1681 std::cout
<< "from " << from
<< " to " << to
<< std::endl
;
1682 new_pg_upmap_items
.push_back(make_pair(from
, to
));
1684 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1685 pending_inc
.new_pg_upmap_items
[rep_pgid
] =
1686 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1687 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1688 tmp
.apply_incremental(pending_inc
);
1689 ASSERT_TRUE(tmp
.have_pg_upmaps(rep_pgid
));
1692 // *maybe_remove_pg_upmaps* should remove the above upmap_item
1693 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1694 clean_pg_upmaps(g_ceph_context
, tmp
, pending_inc
);
1695 tmp
.apply_incremental(pending_inc
);
1696 ASSERT_FALSE(tmp
.have_pg_upmaps(rep_pgid
));
1697 ASSERT_FALSE(tmp
.have_pg_upmaps(rep_pgid2
));
1702 TEST(PGTempMap
, basic
)
1706 for (auto i
=3; i
<1000; ++i
) {
1708 m
.set(x
, {static_cast<int>(i
)});
1712 ASSERT_NE(m
.find(a
), m
.end());
1713 ASSERT_EQ(m
.find(a
), m
.begin());
1714 ASSERT_EQ(m
.find(b
), m
.end());
1715 ASSERT_EQ(998u, m
.size());
1718 TEST_F(OSDMapTest
, BUG_43124
) {
1721 // https://tracker.ceph.com/issues/43124
1723 // build a temporary crush topology of 5racks,
1724 // 4 hosts per rack, 10osds per host
1725 OSDMap tmp
; // use a tmpmap here, so we do not dirty origin map..
1726 tmp
.deepish_copy_from(osdmap
);
1727 const int expected_host_num
= 20;
1728 int osd_per_host
= (int)get_num_osds() / expected_host_num
;
1729 ASSERT_GE(osd_per_host
, 10);
1730 int host_per_rack
= 4;
1733 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
1734 if (i
&& i
% osd_per_host
== 0) {
1737 if (i
&& i
% (host_per_rack
* osd_per_host
) == 0) {
1740 stringstream osd_name
;
1741 stringstream host_name
;
1742 stringstream rack_name
;
1743 vector
<string
> move_to
;
1744 osd_name
<< "osd." << i
;
1745 host_name
<< "host-" << index
;
1746 rack_name
<< "rack-" << rack_index
;
1747 move_to
.push_back("root=default");
1748 string rack_loc
= "rack=" + rack_name
.str();
1749 move_to
.push_back(rack_loc
);
1750 string host_loc
= "host=" + host_name
.str();
1751 move_to
.push_back(host_loc
);
1752 auto r
= crush_move(tmp
, osd_name
.str(), move_to
);
1758 get_crush(tmp
, crush
);
1759 string rule_name
= "rule_angel_1944";
1760 int rule_type
= pg_pool_t::TYPE_ERASURE
;
1761 ASSERT_TRUE(!crush
.rule_exists(rule_name
));
1763 for (rno
= 0; rno
< crush
.get_max_rules(); rno
++) {
1764 if (!crush
.rule_exists(rno
) && !crush
.ruleset_exists(rno
))
1770 string root_name
= "default";
1771 int root
= crush
.get_item_id(root_name
);
1772 crush_rule
*rule
= crush_make_rule(steps
, rno
, rule_type
, min_size
, max_size
);
1774 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSELEAF_TRIES
, 5, 0);
1775 crush_rule_set_step(rule
, step
++, CRUSH_RULE_SET_CHOOSE_TRIES
, 100, 0);
1776 crush_rule_set_step(rule
, step
++, CRUSH_RULE_TAKE
, root
, 0);
1777 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSE_FIRSTN
, 4, 3 /* rack */);
1778 crush_rule_set_step(rule
, step
++, CRUSH_RULE_CHOOSELEAF_INDEP
, 3, 1 /* host */);
1779 crush_rule_set_step(rule
, step
++, CRUSH_RULE_EMIT
, 0, 0);
1780 ASSERT_TRUE(step
== steps
);
1781 auto r
= crush_add_rule(crush
.get_crush_map(), rule
, rno
);
1782 ASSERT_TRUE(r
>= 0);
1783 crush
.set_rule_name(rno
, rule_name
);
1785 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1786 pending_inc
.crush
.clear();
1787 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1788 tmp
.apply_incremental(pending_inc
);
1792 crush
.dump_tree(&oss
, NULL
);
1793 std::cout
<< oss
.str() << std::endl
;
1794 Formatter
*f
= Formatter::create("json-pretty");
1795 f
->open_object_section("crush_rules");
1796 crush
.dump_rules(f
);
1801 // create a erasuce-coded pool referencing the above rule
1802 int64_t pool_angel_1944
;
1804 OSDMap::Incremental
new_pool_inc(tmp
.get_epoch() + 1);
1805 new_pool_inc
.new_pool_max
= tmp
.get_pool_max();
1806 new_pool_inc
.fsid
= tmp
.get_fsid();
1808 pool_angel_1944
= ++new_pool_inc
.new_pool_max
;
1809 pg_pool_t
*p
= new_pool_inc
.get_new_pool(pool_angel_1944
, &empty
);
1811 p
->set_pg_num(4096);
1812 p
->set_pgp_num(4096);
1813 p
->type
= pg_pool_t::TYPE_ERASURE
;
1814 p
->crush_rule
= rno
;
1815 p
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
1816 new_pool_inc
.new_pool_names
[pool_angel_1944
] = "pool_angel_1944";
1817 tmp
.apply_incremental(new_pool_inc
);
1820 pg_t
rep_pg(0, pool_angel_1944
);
1821 pg_t rep_pgid
= tmp
.raw_pg_to_pg(rep_pg
);
1823 // insert a pg_upmap_item
1828 tmp
.pg_to_raw_up(rep_pgid
, &rep_up
, &rep_up_primary
);
1829 std::cout
<< "pgid " << rep_pgid
<< " up " << rep_up
<< std::endl
;
1830 ASSERT_TRUE(rep_up
.size() == 12);
1831 from
= *(rep_up
.begin());
1832 ASSERT_TRUE(from
>= 0);
1833 auto from_rack
= tmp
.crush
->get_parent_of_type(from
, 3 /* rack */, rno
);
1834 set
<int> failure_domains
;
1835 for (auto &osd
: rep_up
) {
1836 failure_domains
.insert(tmp
.crush
->get_parent_of_type(osd
, 1 /* host */, rno
));
1838 for (int i
= 0; i
< (int)get_num_osds(); i
++) {
1839 if (std::find(rep_up
.begin(), rep_up
.end(), i
) == rep_up
.end()) {
1840 auto to_rack
= tmp
.crush
->get_parent_of_type(i
, 3 /* rack */, rno
);
1841 auto to_host
= tmp
.crush
->get_parent_of_type(i
, 1 /* host */, rno
);
1842 if (to_rack
!= from_rack
&& failure_domains
.count(to_host
) == 0) {
1848 ASSERT_TRUE(to
>= 0);
1849 ASSERT_TRUE(from
!= to
);
1850 std::cout
<< "from " << from
<< " to " << to
<< std::endl
;
1851 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
1852 new_pg_upmap_items
.push_back(make_pair(from
, to
));
1853 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1854 pending_inc
.new_pg_upmap_items
[rep_pgid
] =
1855 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
1856 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
1857 tmp
.apply_incremental(pending_inc
);
1858 ASSERT_TRUE(tmp
.have_pg_upmaps(rep_pgid
));
1861 // *maybe_remove_pg_upmaps* should not remove the above upmap_item
1862 OSDMap::Incremental
pending_inc(tmp
.get_epoch() + 1);
1863 clean_pg_upmaps(g_ceph_context
, tmp
, pending_inc
);
1864 tmp
.apply_incremental(pending_inc
);
1865 ASSERT_TRUE(tmp
.have_pg_upmaps(rep_pgid
));
1870 TEST_F(OSDMapTest
, BUG_48884
)
1875 unsigned int host_index
= 1;
1876 for (unsigned int x
=0; x
< get_num_osds();) {
1877 // Create three hosts with four osds each
1878 for (unsigned int y
=0; y
< 4; y
++) {
1879 stringstream osd_name
;
1880 stringstream host_name
;
1881 vector
<string
> move_to
;
1882 osd_name
<< "osd." << x
;
1883 host_name
<< "host-" << host_index
;
1884 move_to
.push_back("root=default");
1885 move_to
.push_back("rack=localrack");
1886 string host_loc
= "host=" + host_name
.str();
1887 move_to
.push_back(host_loc
);
1888 int r
= crush_move(osdmap
, osd_name
.str(), move_to
);
1896 get_crush(osdmap
, crush
);
1897 auto host_id
= crush
.get_item_id("localhost");
1898 crush
.remove_item(g_ceph_context
, host_id
, false);
1899 OSDMap::Incremental
pending_inc(osdmap
.get_epoch() + 1);
1900 pending_inc
.crush
.clear();
1901 crush
.encode(pending_inc
.crush
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
1902 osdmap
.apply_incremental(pending_inc
);
1905 osd_stat_t stats
, stats_null
;
1906 stats
.statfs
.total
= 500000;
1907 stats
.statfs
.available
= 50000;
1908 stats
.statfs
.omap_allocated
= 50000;
1909 stats
.statfs
.internal_metadata
= 50000;
1910 stats_null
.statfs
.total
= 0;
1911 stats_null
.statfs
.available
= 0;
1912 stats_null
.statfs
.omap_allocated
= 0;
1913 stats_null
.statfs
.internal_metadata
= 0;
1914 for (unsigned int x
=0; x
< get_num_osds(); x
++) {
1915 if (x
> 3 && x
< 8) {
1916 pgmap
.osd_stat
.insert({x
,stats_null
});
1918 pgmap
.osd_stat
.insert({x
,stats
});
1923 boost::scoped_ptr
<Formatter
> f(Formatter::create("json-pretty"));
1924 print_osd_utilization(osdmap
, pgmap
, ss
, f
.get(), true, "root");
1926 parser
.parse(ss
.str().c_str(), static_cast<int>(ss
.str().size()));
1927 auto iter
= parser
.find_first();
1928 for (const auto& bucket
: (*iter
)->get_array_elements()) {
1930 parser2
.parse(bucket
.c_str(), static_cast<int>(bucket
.size()));
1931 auto* obj
= parser2
.find_obj("name");
1932 if (obj
->get_data_val().str
.compare("localrack") == 0) {
1933 obj
= parser2
.find_obj("kb");
1934 ASSERT_EQ(obj
->get_data_val().str
, "3904");
1935 obj
= parser2
.find_obj("kb_used");
1936 ASSERT_EQ(obj
->get_data_val().str
, "3512");
1937 obj
= parser2
.find_obj("kb_used_omap");
1938 ASSERT_EQ(obj
->get_data_val().str
, "384");
1939 obj
= parser2
.find_obj("kb_used_meta");
1940 ASSERT_EQ(obj
->get_data_val().str
, "384");
1941 obj
= parser2
.find_obj("kb_avail");
1942 ASSERT_EQ(obj
->get_data_val().str
, "384");