]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/osd/TestOSDMap.cc
bump version to 16.2.6-pve2
[ceph.git] / ceph / src / test / osd / TestOSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 #include "gtest/gtest.h"
3 #include "osd/OSDMap.h"
4 #include "osd/OSDMapMapping.h"
5 #include "mon/OSDMonitor.h"
6 #include "mon/PGMap.h"
7
8 #include "global/global_context.h"
9 #include "global/global_init.h"
10 #include "common/common_init.h"
11 #include "common/ceph_argparse.h"
12 #include "common/ceph_json.h"
13
14 #include <iostream>
15
16 using namespace std;
17
18 int main(int argc, char **argv) {
19 map<string,string> defaults = {
20 // make sure we have 3 copies, or some tests won't work
21 { "osd_pool_default_size", "3" },
22 // our map is flat, so just try and split across OSDs, not hosts or whatever
23 { "osd_crush_chooseleaf_type", "0" },
24 };
25 std::vector<const char*> args(argv, argv+argc);
26 auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
27 CODE_ENVIRONMENT_UTILITY,
28 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
29 common_init_finish(g_ceph_context);
30 ::testing::InitGoogleTest(&argc, argv);
31 return RUN_ALL_TESTS();
32 }
33
// Fixture shared by all OSDMap tests.  Builds a small flat cluster map
// (default: 6 OSDs, all up and in) plus one erasure-coded and one
// replicated pool, and provides helpers for CRUSH manipulation and
// upmap cleanup.
class OSDMapTest : public testing::Test {
  int num_osds = 6;
public:
  OSDMap osdmap;
  OSDMapMapping mapping;
  // Pool ids are asserted below to land on these fixed values, so tests
  // can refer to the pools without looking them up by name.
  const uint64_t my_ec_pool = 1;
  const uint64_t my_rep_pool = 2;


  OSDMapTest() {}

  // Build osdmap from scratch: mark every OSD as existing, up and in
  // via one incremental, then (unless no_default_pools) create the "ec"
  // and "reppool" pools in a second incremental.
  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
    num_osds = new_num_osds;
    uuid_d fsid;
    osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.fsid = osdmap.get_fsid();
    entity_addrvec_t sample_addrs;
    sample_addrs.v.push_back(entity_addr_t());
    uuid_d sample_uuid;
    for (int i = 0; i < num_osds; ++i) {
      sample_uuid.generate_random();
      // distinct nonce per OSD so the (otherwise identical) addrs differ
      sample_addrs.v[0].nonce = i;
      pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW;
      pending_inc.new_up_client[i] = sample_addrs;
      pending_inc.new_up_cluster[i] = sample_addrs;
      pending_inc.new_hb_back_up[i] = sample_addrs;
      pending_inc.new_hb_front_up[i] = sample_addrs;
      pending_inc.new_weight[i] = CEPH_OSD_IN;
      pending_inc.new_uuid[i] = sample_uuid;
    }
    osdmap.apply_incremental(pending_inc);
    if (no_default_pools) // do not create any default pool(s)
      return;

    // Create an EC ruleset and a pool using it
    // NOTE(review): r is used as the pool's crush_rule without checking
    // for a negative (error) return — assumed to always succeed on the
    // freshly built simple map; confirm if set_up_map gains new callers.
    int r = osdmap.crush->add_simple_rule(
      "erasure", "default", "osd", "",
      "indep", pg_pool_t::TYPE_ERASURE,
      &cerr);

    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
    new_pool_inc.new_pool_max = osdmap.get_pool_max();
    new_pool_inc.fsid = osdmap.get_fsid();
    pg_pool_t empty;
    // make an ec pool
    uint64_t pool_id = ++new_pool_inc.new_pool_max;
    ceph_assert(pool_id == my_ec_pool);
    pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->set_pg_num(64);
    p->set_pgp_num(64);
    p->type = pg_pool_t::TYPE_ERASURE;
    p->crush_rule = r;
    new_pool_inc.new_pool_names[pool_id] = "ec";
    // and a replicated pool
    pool_id = ++new_pool_inc.new_pool_max;
    ceph_assert(pool_id == my_rep_pool);
    p = new_pool_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->set_pg_num(64);
    p->set_pgp_num(64);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = 0;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    new_pool_inc.new_pool_names[pool_id] = "reppool";
    osdmap.apply_incremental(new_pool_inc);
  }
  unsigned int get_num_osds() { return num_osds; }
  // Deep-copy tmap's crush map into newcrush via encode/decode round-trip.
  void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) {
    bufferlist bl;
    tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
    auto p = bl.cbegin();
    newcrush.decode(p);
  }
  // Move item `name` to the crush location described by argvec
  // (e.g. {"root=default", "host=host-0"}), committing the modified
  // crush map back into tmap via an incremental.
  // Returns 0 on success (or if already in place), -ENOENT if the item
  // does not exist, or a negative crush error.
  int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) {
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);
    CrushWrapper newcrush;
    get_crush(tmap, newcrush);
    if (!newcrush.name_exists(name)) {
      return -ENOENT;
    }
    int id = newcrush.get_item_id(name);
    int err;
    if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
      // devices (id >= 0) and buckets (id < 0) move via different calls
      if (id >= 0) {
	err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
      } else {
	err = newcrush.move_bucket(g_ceph_context, id, loc);
      }
      if (err >= 0) {
	OSDMap::Incremental pending_inc(tmap.get_epoch() + 1);
	pending_inc.crush.clear();
	newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
	tmap.apply_incremental(pending_inc);
	err = 0;
      }
    } else {
      // already there
      err = 0;
    }
    return err;
  }
  // Create (or return the existing) replicated crush rule `name` rooted
  // at `root`, failure-domain `type`; commits it into osdmap.
  // Returns the rule number, or a negative error from add_simple_rule.
  int crush_rule_create_replicated(const string &name,
				   const string &root,
				   const string &type) {
    if (osdmap.crush->rule_exists(name)) {
      return osdmap.crush->get_rule_id(name);
    }
    CrushWrapper newcrush;
    get_crush(osdmap, newcrush);
    string device_class;
    stringstream ss;
    int ruleno = newcrush.add_simple_rule(
      name, root, type, device_class,
      "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
    if (ruleno >= 0) {
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }
    return ruleno;
  }
  // Map PGs 0..num-1 of `pool` and accumulate per-OSD histograms:
  //   any[osd]     - appearances anywhere in the acting set
  //   first[osd]   - appearances as acting[0]
  //   primary[osd] - appearances as acting_primary
  // Also cross-checks each mapping against the precalculated
  // OSDMapMapping (asserts on mismatch).
  void test_mappings(int pool,
		     int num,
		     vector<int> *any,
		     vector<int> *first,
		     vector<int> *primary) {
    mapping.update(osdmap);
    for (int i=0; i<num; ++i) {
      vector<int> up, acting;
      int up_primary, acting_primary;
      pg_t pgid(i, pool);
      osdmap.pg_to_up_acting_osds(pgid,
				  &up, &up_primary, &acting, &acting_primary);
      for (unsigned j=0; j<acting.size(); ++j)
	(*any)[acting[j]]++;
      if (!acting.empty())
	(*first)[acting[0]]++;
      if (acting_primary >= 0)
	(*primary)[acting_primary]++;

      // compare to precalc mapping
      vector<int> up2, acting2;
      int up_primary2, acting_primary2;
      pgid = osdmap.raw_pg_to_pg(pgid);
      mapping.get(pgid, &up2, &up_primary2, &acting2, &acting_primary2);
      ASSERT_EQ(up, up2);
      ASSERT_EQ(up_primary, up_primary2);
      ASSERT_EQ(acting, acting2);
      ASSERT_EQ(acting_primary, acting_primary2);
    }
    cout << "any: " << *any << std::endl;;
    cout << "first: " << *first << std::endl;;
    cout << "primary: " << *primary << std::endl;;
  }
  // Run OSDMonitor's parallel upmap-cleanup job against `om`, recording
  // any cancellations into pending_inc.  Exercises the multi-threaded
  // path (see BUG 40104) rather than a serial cleanup.
  void clean_pg_upmaps(CephContext *cct,
		       const OSDMap& om,
		       OSDMap::Incremental& pending_inc) {
    int cpu_num = 8;
    int pgs_per_chunk = 256;
    ThreadPool tp(cct, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num);
    tp.start();
    ParallelPGMapper mapper(cct, &tp);
    vector<pg_t> pgs_to_check;
    om.get_upmap_pgs(&pgs_to_check);
    OSDMonitor::CleanUpmapJob job(cct, om, pending_inc);
    mapper.queue(&job, pgs_per_chunk, pgs_to_check);
    job.wait();
    tp.stop();
  }
};
208
209 TEST_F(OSDMapTest, Create) {
210 set_up_map();
211 ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
212 ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
213 }
214
// Feature bits reported by the map must reflect what the map actually
// contains (crush tunables, EC pool, primary affinity, ...), and may
// differ per entity type.
TEST_F(OSDMapTest, Features) {
  // with EC pool
  set_up_map();
  uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // clients have a slightly different view
  features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // remove the EC pool, but leave the rule. add primary affinity.
  {
    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
    new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
    new_pool_inc.new_primary_affinity[0] = 0x8000;
    osdmap.apply_incremental(new_pool_inc);
  }

  // with the EC pool gone, CRUSH_V2 should drop; affinity bits remain
  features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // FIXME: test tiering feature bits
}
253
254 TEST_F(OSDMapTest, MapPG) {
255 set_up_map();
256
257 std::cerr << " osdmap.pool_max==" << osdmap.get_pool_max() << std::endl;
258 pg_t rawpg(0, my_rep_pool);
259 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
260 vector<int> up_osds, acting_osds;
261 int up_primary, acting_primary;
262
263 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
264 &acting_osds, &acting_primary);
265
266 vector<int> old_up_osds, old_acting_osds;
267 osdmap.pg_to_up_acting_osds(pgid, old_up_osds, old_acting_osds);
268 ASSERT_EQ(old_up_osds, up_osds);
269 ASSERT_EQ(old_acting_osds, acting_osds);
270
271 ASSERT_EQ(osdmap.get_pg_pool(my_rep_pool)->get_size(), up_osds.size());
272 }
273
274 TEST_F(OSDMapTest, MapFunctionsMatch) {
275 // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match
276 set_up_map();
277 pg_t rawpg(0, my_rep_pool);
278 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
279 vector<int> up_osds, acting_osds;
280 int up_primary, acting_primary;
281
282 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
283 &acting_osds, &acting_primary);
284
285 vector<int> up_osds_two, acting_osds_two;
286
287 osdmap.pg_to_up_acting_osds(pgid, up_osds_two, acting_osds_two);
288
289 ASSERT_EQ(up_osds, up_osds_two);
290 ASSERT_EQ(acting_osds, acting_osds_two);
291
292 int acting_primary_two;
293 osdmap.pg_to_acting_osds(pgid, &acting_osds_two, &acting_primary_two);
294 EXPECT_EQ(acting_osds, acting_osds_two);
295 EXPECT_EQ(acting_primary, acting_primary_two);
296 osdmap.pg_to_acting_osds(pgid, acting_osds_two);
297 EXPECT_EQ(acting_osds, acting_osds_two);
298 }
299
300 /** This test must be removed or modified appropriately when we allow
301 * other ways to specify a primary. */
302 TEST_F(OSDMapTest, PrimaryIsFirst) {
303 set_up_map();
304
305 pg_t rawpg(0, my_rep_pool);
306 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
307 vector<int> up_osds, acting_osds;
308 int up_primary, acting_primary;
309
310 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
311 &acting_osds, &acting_primary);
312 EXPECT_EQ(up_osds[0], up_primary);
313 EXPECT_EQ(acting_osds[0], acting_primary);
314 }
315
316 TEST_F(OSDMapTest, PGTempRespected) {
317 set_up_map();
318
319 pg_t rawpg(0, my_rep_pool);
320 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
321 vector<int> up_osds, acting_osds;
322 int up_primary, acting_primary;
323
324 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
325 &acting_osds, &acting_primary);
326
327 // copy and swap first and last element in acting_osds
328 vector<int> new_acting_osds(acting_osds);
329 int first = new_acting_osds[0];
330 new_acting_osds[0] = *new_acting_osds.rbegin();
331 *new_acting_osds.rbegin() = first;
332
333 // apply pg_temp to osdmap
334 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
335 pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
336 new_acting_osds.begin(), new_acting_osds.end());
337 osdmap.apply_incremental(pgtemp_map);
338
339 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
340 &acting_osds, &acting_primary);
341 EXPECT_EQ(new_acting_osds, acting_osds);
342 }
343
344 TEST_F(OSDMapTest, PrimaryTempRespected) {
345 set_up_map();
346
347 pg_t rawpg(0, my_rep_pool);
348 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
349 vector<int> up_osds;
350 vector<int> acting_osds;
351 int up_primary, acting_primary;
352
353 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
354 &acting_osds, &acting_primary);
355
356 // make second OSD primary via incremental
357 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
358 pgtemp_map.new_primary_temp[pgid] = acting_osds[1];
359 osdmap.apply_incremental(pgtemp_map);
360
361 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
362 &acting_osds, &acting_primary);
363 EXPECT_EQ(acting_primary, acting_osds[1]);
364 }
365
// clean_temps() must cancel temp entries that merely restate the
// calculated mapping: an already-applied redundant temp (pga) gets an
// explicit removal queued, while a redundant temp that is still only
// pending (pgb) is simply dropped from the pending incremental.
TEST_F(OSDMapTest, CleanTemps) {
  set_up_map();

  // note the two different epochs: pgtemp_map is applied to osdmap
  // below, pending_inc stays pending on top of it
  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 2);
  pg_t pga = osdmap.raw_pg_to_pg(pg_t(0, my_rep_pool));
  {
    // temp for pga == the calculated up set/primary, i.e. redundant
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pga, &up_osds, &up_primary,
				&acting_osds, &acting_primary);
    pgtemp_map.new_pg_temp[pga] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pgtemp_map.new_primary_temp[pga] = up_primary;
  }
  pg_t pgb = osdmap.raw_pg_to_pg(pg_t(1, my_rep_pool));
  {
    // same redundant temp for pgb, but only in the pending incremental
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pgb, &up_osds, &up_primary,
				&acting_osds, &acting_primary);
    pending_inc.new_pg_temp[pgb] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pending_inc.new_primary_temp[pgb] = up_primary;
  }

  osdmap.apply_incremental(pgtemp_map);

  OSDMap tmpmap;
  tmpmap.deepish_copy_from(osdmap);
  tmpmap.apply_incremental(pending_inc);
  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);

  // pga's applied temp is cancelled via empty pg_temp / -1 primary_temp
  EXPECT_TRUE(pending_inc.new_pg_temp.count(pga) &&
	      pending_inc.new_pg_temp[pga].size() == 0);
  EXPECT_EQ(-1, pending_inc.new_primary_temp[pga]);

  // pgb's never-applied temp is removed from the incremental outright
  EXPECT_TRUE(!pending_inc.new_pg_temp.count(pgb) &&
	      !pending_inc.new_primary_temp.count(pgb));
}
406
407 TEST_F(OSDMapTest, KeepsNecessaryTemps) {
408 set_up_map();
409
410 pg_t rawpg(0, my_rep_pool);
411 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
412 vector<int> up_osds, acting_osds;
413 int up_primary, acting_primary;
414
415 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
416 &acting_osds, &acting_primary);
417
418 // find unused OSD and stick it in there
419 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
420 // find an unused osd and put it in place of the first one
421 int i = 0;
422 for(; i != (int)get_num_osds(); ++i) {
423 bool in_use = false;
424 for (vector<int>::iterator osd_it = up_osds.begin();
425 osd_it != up_osds.end();
426 ++osd_it) {
427 if (i == *osd_it) {
428 in_use = true;
429 break;
430 }
431 }
432 if (!in_use) {
433 up_osds[1] = i;
434 break;
435 }
436 }
437 if (i == (int)get_num_osds())
438 FAIL() << "did not find unused OSD for temp mapping";
439
440 pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
441 up_osds.begin(), up_osds.end());
442 pgtemp_map.new_primary_temp[pgid] = up_osds[1];
443 osdmap.apply_incremental(pgtemp_map);
444
445 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
446
447 OSDMap tmpmap;
448 tmpmap.deepish_copy_from(osdmap);
449 tmpmap.apply_incremental(pending_inc);
450 OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
451 EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid));
452 EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid));
453 }
454
// Statistical check that primary affinity weights shift primary
// selection: affinity 0 removes an OSD from primary duty, 0x8000 (half
// of the full 0x10000) roughly halves its share.  Runs once per pool
// (EC and replicated), restoring full affinity at the end of each pass.
TEST_F(OSDMapTest, PrimaryAffinity) {
  set_up_map();

  int n = get_num_osds();
  for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
       p != osdmap.get_pools().end();
       ++p) {
    int pool = p->first;
    // with uniform affinity, each OSD expects ~1/n of 10000 primaries
    int expect_primary = 10000 / n;
    cout << "pool " << pool << " size " << (int)p->second.size
	 << " expect_primary " << expect_primary << std::endl;
    {
      // baseline: every OSD appears somewhere, first, and as primary
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
	ASSERT_LT(0, any[i]);
	ASSERT_LT(0, first[i]);
	ASSERT_LT(0, primary[i]);
      }
    }

    // zero affinity: osd.0 and osd.1 must never be primary
    osdmap.set_primary_affinity(0, 0);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
	ASSERT_LT(0, any[i]);
	if (i >= 2) {
	  ASSERT_LT(0, first[i]);
	  ASSERT_LT(0, primary[i]);
	} else {
	  // for EC pools the shard order is fixed, so only replicated
	  // pools guarantee the zero-affinity OSD is never first
	  if (p->second.is_replicated()) {
	    ASSERT_EQ(0, first[i]);
	  }
	  ASSERT_EQ(0, primary[i]);
	}
      }
    }

    // half affinity on osd.0, zero on osd.1
    osdmap.set_primary_affinity(0, 0x8000);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      int expect = (10000 / (n-2)) / 2; // half weight
      cout << "expect " << expect << std::endl;
      for (int i=0; i<n; ++i) {
	ASSERT_LT(0, any[i]);
	if (i >= 2) {
	  ASSERT_LT(0, first[i]);
	  ASSERT_LT(0, primary[i]);
	} else if (i == 1) {
	  if (p->second.is_replicated()) {
	    ASSERT_EQ(0, first[i]);
	  }
	  ASSERT_EQ(0, primary[i]);
	} else {
	  // osd.0: primary count should land within +/-1/3 of expect
	  ASSERT_LT(expect *2/3, primary[0]);
	  ASSERT_GT(expect *4/3, primary[0]);
	}
      }
    }

    // restore full affinity for the next pool's pass
    osdmap.set_primary_affinity(0, 0x10000);
    osdmap.set_primary_affinity(1, 0x10000);
  }
}
529
530 TEST_F(OSDMapTest, get_osd_crush_node_flags) {
531 set_up_map();
532
533 for (unsigned i=0; i<get_num_osds(); ++i) {
534 ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
535 }
536
537 OSDMap::Incremental inc(osdmap.get_epoch() + 1);
538 inc.new_crush_node_flags[-1] = 123u;
539 osdmap.apply_incremental(inc);
540 for (unsigned i=0; i<get_num_osds(); ++i) {
541 ASSERT_EQ(123u, osdmap.get_osd_crush_node_flags(i));
542 }
543 ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));
544
545 OSDMap::Incremental inc3(osdmap.get_epoch() + 1);
546 inc3.new_crush_node_flags[-1] = 456u;
547 osdmap.apply_incremental(inc3);
548 for (unsigned i=0; i<get_num_osds(); ++i) {
549 ASSERT_EQ(456u, osdmap.get_osd_crush_node_flags(i));
550 }
551 ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));
552
553 OSDMap::Incremental inc2(osdmap.get_epoch() + 1);
554 inc2.new_crush_node_flags[-1] = 0;
555 osdmap.apply_incremental(inc2);
556 for (unsigned i=0; i<get_num_osds(); ++i) {
557 ASSERT_EQ(0u, osdmap.get_crush_node_flags(i));
558 }
559 }
560
561 TEST_F(OSDMapTest, parse_osd_id_list) {
562 set_up_map();
563 set<int> out;
564 set<int> all;
565 osdmap.get_all_osds(all);
566
567 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0"}, &out, &cout));
568 ASSERT_EQ(1u, out.size());
569 ASSERT_EQ(0, *out.begin());
570
571 ASSERT_EQ(0, osdmap.parse_osd_id_list({"1"}, &out, &cout));
572 ASSERT_EQ(1u, out.size());
573 ASSERT_EQ(1, *out.begin());
574
575 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","osd.1"}, &out, &cout));
576 ASSERT_EQ(2u, out.size());
577 ASSERT_EQ(0, *out.begin());
578 ASSERT_EQ(1, *out.rbegin());
579
580 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","1"}, &out, &cout));
581 ASSERT_EQ(2u, out.size());
582 ASSERT_EQ(0, *out.begin());
583 ASSERT_EQ(1, *out.rbegin());
584
585 ASSERT_EQ(0, osdmap.parse_osd_id_list({"*"}, &out, &cout));
586 ASSERT_EQ(all.size(), out.size());
587 ASSERT_EQ(all, out);
588
589 ASSERT_EQ(0, osdmap.parse_osd_id_list({"all"}, &out, &cout));
590 ASSERT_EQ(all, out);
591
592 ASSERT_EQ(0, osdmap.parse_osd_id_list({"any"}, &out, &cout));
593 ASSERT_EQ(all, out);
594
595 ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"foo"}, &out, &cout));
596 ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
597 }
598
599 TEST_F(OSDMapTest, CleanPGUpmaps) {
600 set_up_map();
601
602 // build a crush rule of type host
603 const int expected_host_num = 3;
604 int osd_per_host = get_num_osds() / expected_host_num;
605 ASSERT_GE(2, osd_per_host);
606 int index = 0;
607 for (int i = 0; i < (int)get_num_osds(); i++) {
608 if (i && i % osd_per_host == 0) {
609 ++index;
610 }
611 stringstream osd_name;
612 stringstream host_name;
613 vector<string> move_to;
614 osd_name << "osd." << i;
615 host_name << "host-" << index;
616 move_to.push_back("root=default");
617 string host_loc = "host=" + host_name.str();
618 move_to.push_back(host_loc);
619 int r = crush_move(osdmap, osd_name.str(), move_to);
620 ASSERT_EQ(0, r);
621 }
622 const string upmap_rule = "upmap";
623 int upmap_rule_no = crush_rule_create_replicated(
624 upmap_rule, "default", "host");
625 ASSERT_LT(0, upmap_rule_no);
626
627 // create a replicated pool which references the above rule
628 OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
629 new_pool_inc.new_pool_max = osdmap.get_pool_max();
630 new_pool_inc.fsid = osdmap.get_fsid();
631 pg_pool_t empty;
632 uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
633 pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
634 p->size = 2;
635 p->set_pg_num(64);
636 p->set_pgp_num(64);
637 p->type = pg_pool_t::TYPE_REPLICATED;
638 p->crush_rule = upmap_rule_no;
639 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
640 new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
641 osdmap.apply_incremental(new_pool_inc);
642
643 pg_t rawpg(0, upmap_pool_id);
644 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
645 vector<int> up;
646 int up_primary;
647 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
648 ASSERT_LT(1U, up.size());
649 {
650 // validate we won't have two OSDs from a same host
651 int parent_0 = osdmap.crush->get_parent_of_type(up[0],
652 osdmap.crush->get_type_id("host"));
653 int parent_1 = osdmap.crush->get_parent_of_type(up[1],
654 osdmap.crush->get_type_id("host"));
655 ASSERT_TRUE(parent_0 != parent_1);
656 }
657
658 {
659 // cancel stale upmaps
660 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
661 int from = -1;
662 for (int i = 0; i < (int)get_num_osds(); i++) {
663 if (std::find(up.begin(), up.end(), i) == up.end()) {
664 from = i;
665 break;
666 }
667 }
668 ASSERT_TRUE(from >= 0);
669 int to = -1;
670 for (int i = 0; i < (int)get_num_osds(); i++) {
671 if (std::find(up.begin(), up.end(), i) == up.end() && i != from) {
672 to = i;
673 break;
674 }
675 }
676 ASSERT_TRUE(to >= 0);
677 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
678 new_pg_upmap_items.push_back(make_pair(from, to));
679 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
680 pending_inc.new_pg_upmap_items[pgid] =
681 mempool::osdmap::vector<pair<int32_t,int32_t>>(
682 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
683 OSDMap nextmap;
684 nextmap.deepish_copy_from(osdmap);
685 nextmap.apply_incremental(pending_inc);
686 ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
687 OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
688 clean_pg_upmaps(g_ceph_context, nextmap, new_pending_inc);
689 nextmap.apply_incremental(new_pending_inc);
690 ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
691 }
692
693 {
694 // https://tracker.ceph.com/issues/37493
695 pg_t ec_pg(0, my_ec_pool);
696 pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
697 OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
698 int from = -1;
699 int to = -1;
700 {
701 // insert a valid pg_upmap_item
702 vector<int> ec_up;
703 int ec_up_primary;
704 osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
705 ASSERT_TRUE(!ec_up.empty());
706 from = *(ec_up.begin());
707 ASSERT_TRUE(from >= 0);
708 for (int i = 0; i < (int)get_num_osds(); i++) {
709 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
710 to = i;
711 break;
712 }
713 }
714 ASSERT_TRUE(to >= 0);
715 ASSERT_TRUE(from != to);
716 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
717 new_pg_upmap_items.push_back(make_pair(from, to));
718 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
719 pending_inc.new_pg_upmap_items[ec_pgid] =
720 mempool::osdmap::vector<pair<int32_t,int32_t>>(
721 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
722 tmpmap.deepish_copy_from(osdmap);
723 tmpmap.apply_incremental(pending_inc);
724 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
725 }
726 {
727 // mark one of the target OSDs of the above pg_upmap_item as down
728 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
729 pending_inc.new_state[to] = CEPH_OSD_UP;
730 tmpmap.apply_incremental(pending_inc);
731 ASSERT_TRUE(!tmpmap.is_up(to));
732 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
733 }
734 {
735 // confirm *clean_pg_upmaps* won't do anything bad
736 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
737 clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
738 tmpmap.apply_incremental(pending_inc);
739 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
740 }
741 }
742
743 {
744 // http://tracker.ceph.com/issues/37501
745 pg_t ec_pg(0, my_ec_pool);
746 pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
747 OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
748 int from = -1;
749 int to = -1;
750 {
751 // insert a valid pg_upmap_item
752 vector<int> ec_up;
753 int ec_up_primary;
754 osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
755 ASSERT_TRUE(!ec_up.empty());
756 from = *(ec_up.begin());
757 ASSERT_TRUE(from >= 0);
758 for (int i = 0; i < (int)get_num_osds(); i++) {
759 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
760 to = i;
761 break;
762 }
763 }
764 ASSERT_TRUE(to >= 0);
765 ASSERT_TRUE(from != to);
766 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
767 new_pg_upmap_items.push_back(make_pair(from, to));
768 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
769 pending_inc.new_pg_upmap_items[ec_pgid] =
770 mempool::osdmap::vector<pair<int32_t,int32_t>>(
771 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
772 tmpmap.deepish_copy_from(osdmap);
773 tmpmap.apply_incremental(pending_inc);
774 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
775 }
776 {
777 // mark one of the target OSDs of the above pg_upmap_item as out
778 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
779 pending_inc.new_weight[to] = CEPH_OSD_OUT;
780 tmpmap.apply_incremental(pending_inc);
781 ASSERT_TRUE(tmpmap.is_out(to));
782 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
783 }
784 {
785 // *clean_pg_upmaps* should be able to remove the above *bad* mapping
786 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
787 clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
788 tmpmap.apply_incremental(pending_inc);
789 ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
790 }
791 }
792
793 {
794 // http://tracker.ceph.com/issues/37968
795
796 // build a temporary crush topology of 2 hosts, 3 osds per host
797 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
798 tmp.deepish_copy_from(osdmap);
799 const int expected_host_num = 2;
800 int osd_per_host = get_num_osds() / expected_host_num;
801 ASSERT_GE(osd_per_host, 3);
802 int index = 0;
803 for (int i = 0; i < (int)get_num_osds(); i++) {
804 if (i && i % osd_per_host == 0) {
805 ++index;
806 }
807 stringstream osd_name;
808 stringstream host_name;
809 vector<string> move_to;
810 osd_name << "osd." << i;
811 host_name << "host-" << index;
812 move_to.push_back("root=default");
813 string host_loc = "host=" + host_name.str();
814 move_to.push_back(host_loc);
815 auto r = crush_move(tmp, osd_name.str(), move_to);
816 ASSERT_EQ(0, r);
817 }
818
819 // build crush rule
820 CrushWrapper crush;
821 get_crush(tmp, crush);
822 string rule_name = "rule_37968";
823 int rule_type = pg_pool_t::TYPE_ERASURE;
824 ASSERT_TRUE(!crush.rule_exists(rule_name));
825 int rno;
826 for (rno = 0; rno < crush.get_max_rules(); rno++) {
827 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
828 break;
829 }
830 string root_name = "default";
831 int root = crush.get_item_id(root_name);
832 int min_size = 3;
833 int max_size = 4;
834 int steps = 6;
835 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
836 int step = 0;
837 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
838 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
839 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
840 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host*/);
841 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */);
842 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
843 ASSERT_TRUE(step == steps);
844 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
845 ASSERT_TRUE(r >= 0);
846 crush.set_rule_name(rno, rule_name);
847 {
848 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
849 pending_inc.crush.clear();
850 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
851 tmp.apply_incremental(pending_inc);
852 }
853
854 // create a erasuce-coded pool referencing the above rule
855 int64_t pool_37968;
856 {
857 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
858 new_pool_inc.new_pool_max = tmp.get_pool_max();
859 new_pool_inc.fsid = tmp.get_fsid();
860 pg_pool_t empty;
861 pool_37968 = ++new_pool_inc.new_pool_max;
862 pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty);
863 p->size = 4;
864 p->set_pg_num(8);
865 p->set_pgp_num(8);
866 p->type = pg_pool_t::TYPE_ERASURE;
867 p->crush_rule = rno;
868 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
869 new_pool_inc.new_pool_names[pool_37968] = "pool_37968";
870 tmp.apply_incremental(new_pool_inc);
871 }
872
873 pg_t ec_pg(0, pool_37968);
874 pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg);
875 int from = -1;
876 int to = -1;
877 {
878 // insert a valid pg_upmap_item
879 vector<int> ec_up;
880 int ec_up_primary;
881 tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
882 ASSERT_TRUE(ec_up.size() == 4);
883 from = *(ec_up.begin());
884 ASSERT_TRUE(from >= 0);
885 auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno);
886 ASSERT_TRUE(parent < 0);
887 // pick an osd of the same parent with *from*
888 for (int i = 0; i < (int)get_num_osds(); i++) {
889 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
890 auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
891 if (p == parent) {
892 to = i;
893 break;
894 }
895 }
896 }
897 ASSERT_TRUE(to >= 0);
898 ASSERT_TRUE(from != to);
899 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
900 new_pg_upmap_items.push_back(make_pair(from, to));
901 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
902 pending_inc.new_pg_upmap_items[ec_pgid] =
903 mempool::osdmap::vector<pair<int32_t,int32_t>>(
904 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
905 tmp.apply_incremental(pending_inc);
906 ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
907 }
908 {
909 // *clean_pg_upmaps* should not remove the above upmap_item
910 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
911 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
912 tmp.apply_incremental(pending_inc);
913 ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
914 }
915 }
916
917 {
918 // TEST pg_upmap
919 {
920 // STEP-1: enumerate all children of up[0]'s parent,
921 // replace up[1] with one of them (other than up[0])
922 int parent = osdmap.crush->get_parent_of_type(up[0],
923 osdmap.crush->get_type_id("host"));
924 set<int> candidates;
925 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
926 ASSERT_LT(1U, candidates.size());
927 int replaced_by = -1;
928 for (auto c: candidates) {
929 if (c != up[0]) {
930 replaced_by = c;
931 break;
932 }
933 }
934 {
935 // Check we can handle a negative pg_upmap value
936 vector<int32_t> new_pg_upmap;
937 new_pg_upmap.push_back(up[0]);
938 new_pg_upmap.push_back(-823648512);
939 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
940 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
941 new_pg_upmap.begin(), new_pg_upmap.end());
942 osdmap.apply_incremental(pending_inc);
943 vector<int> new_up;
944 int new_up_primary;
945 // crucial call - _apply_upmap should ignore the negative value
946 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
947 }
948 ASSERT_NE(-1, replaced_by);
949 // generate a new pg_upmap item and apply
950 vector<int32_t> new_pg_upmap;
951 new_pg_upmap.push_back(up[0]);
952 new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
953 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
954 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
955 new_pg_upmap.begin(), new_pg_upmap.end());
956 osdmap.apply_incremental(pending_inc);
957 {
958 // validate pg_upmap is there
959 vector<int> new_up;
960 int new_up_primary;
961 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
962 ASSERT_EQ(new_up.size(), up.size());
963 ASSERT_EQ(new_up[0], new_pg_upmap[0]);
964 ASSERT_EQ(new_up[1], new_pg_upmap[1]);
965 // and we shall have two OSDs from a same host now..
966 int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
967 osdmap.crush->get_type_id("host"));
968 int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
969 osdmap.crush->get_type_id("host"));
970 ASSERT_EQ(parent_0, parent_1);
971 }
972 }
973 {
974 // STEP-2: apply cure
975 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
976 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
977 osdmap.apply_incremental(pending_inc);
978 {
979 // validate pg_upmap is gone (reverted)
980 vector<int> new_up;
981 int new_up_primary;
982 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
983 ASSERT_EQ(new_up, up);
984 ASSERT_EQ(new_up_primary, up_primary);
985 }
986 }
987 }
988
989 {
990 // TEST pg_upmap_items
991 // enumerate all used hosts first
992 set<int> parents;
993 for (auto u: up) {
994 int parent = osdmap.crush->get_parent_of_type(u,
995 osdmap.crush->get_type_id("host"));
996 ASSERT_GT(0, parent);
997 parents.insert(parent);
998 }
999 int candidate_parent = 0;
1000 set<int> candidate_children;
1001 vector<int> up_after_out;
1002 {
1003 // STEP-1: try mark out up[1] and all other OSDs from the same host
1004 int parent = osdmap.crush->get_parent_of_type(up[1],
1005 osdmap.crush->get_type_id("host"));
1006 set<int> children;
1007 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
1008 &children);
1009 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1010 for (auto c: children) {
1011 pending_inc.new_weight[c] = CEPH_OSD_OUT;
1012 }
1013 OSDMap tmpmap;
1014 tmpmap.deepish_copy_from(osdmap);
1015 tmpmap.apply_incremental(pending_inc);
1016 vector<int> new_up;
1017 int new_up_primary;
1018 tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1019 // verify that we'll have OSDs from a different host..
1020 int will_choose = -1;
1021 for (auto o: new_up) {
1022 int parent = tmpmap.crush->get_parent_of_type(o,
1023 osdmap.crush->get_type_id("host"));
1024 if (!parents.count(parent)) {
1025 will_choose = o;
1026 candidate_parent = parent; // record
1027 break;
1028 }
1029 }
1030 ASSERT_LT(-1, will_choose); // it is an OSD!
1031 ASSERT_NE(candidate_parent, 0);
1032 osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
1033 &candidate_children);
1034 ASSERT_TRUE(candidate_children.count(will_choose));
1035 candidate_children.erase(will_choose);
1036 ASSERT_FALSE(candidate_children.empty());
1037 up_after_out = new_up; // needed for verification..
1038 }
1039 {
1040 // Make sure we can handle a negative pg_upmap_item
1041 int victim = up[0];
1042 int replaced_by = -823648512;
1043 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1044 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1045 // apply
1046 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1047 pending_inc.new_pg_upmap_items[pgid] =
1048 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1049 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1050 osdmap.apply_incremental(pending_inc);
1051 vector<int> new_up;
1052 int new_up_primary;
1053 // crucial call - _apply_upmap should ignore the negative value
1054 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1055 }
1056 {
1057 // STEP-2: generating a new pg_upmap_items entry by
1058 // replacing up[0] with one coming from candidate_children
1059 int victim = up[0];
1060 int replaced_by = *candidate_children.begin();
1061 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1062 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1063 // apply
1064 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1065 pending_inc.new_pg_upmap_items[pgid] =
1066 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1067 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1068 osdmap.apply_incremental(pending_inc);
1069 {
1070 // validate pg_upmap_items is there
1071 vector<int> new_up;
1072 int new_up_primary;
1073 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1074 ASSERT_EQ(new_up.size(), up.size());
1075 ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
1076 new_up.end());
1077 // and up[1] too
1078 ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
1079 new_up.end());
1080 }
1081 }
1082 {
1083 // STEP-3: mark out up[1] and all other OSDs from the same host
1084 int parent = osdmap.crush->get_parent_of_type(up[1],
1085 osdmap.crush->get_type_id("host"));
1086 set<int> children;
1087 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
1088 &children);
1089 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1090 for (auto c: children) {
1091 pending_inc.new_weight[c] = CEPH_OSD_OUT;
1092 }
1093 osdmap.apply_incremental(pending_inc);
1094 {
1095 // validate we have two OSDs from the same host now..
1096 vector<int> new_up;
1097 int new_up_primary;
1098 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1099 ASSERT_EQ(up.size(), new_up.size());
1100 int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
1101 osdmap.crush->get_type_id("host"));
1102 int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
1103 osdmap.crush->get_type_id("host"));
1104 ASSERT_EQ(parent_0, parent_1);
1105 }
1106 }
1107 {
1108 // STEP-4: apply cure
1109 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1110 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1111 osdmap.apply_incremental(pending_inc);
1112 {
1113 // validate pg_upmap_items is gone (reverted)
1114 vector<int> new_up;
1115 int new_up_primary;
1116 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1117 ASSERT_EQ(new_up, up_after_out);
1118 }
1119 }
1120 }
1121 }
1122
// Regression test for http://tracker.ceph.com/issues/38897.
// Creates two pools with customized crush rules, pre-loads upmaps into
// both, then runs calc_pg_upmaps restricted (via only_pools) to pool1
// while pool2 still carries a pg_upmap_items entry.
TEST_F(OSDMapTest, BUG_38897) {
  // http://tracker.ceph.com/issues/38897
  // build a fresh map with 12 OSDs, without any default pools
  set_up_map(12, true);
  const string pool_1("pool1");
  const string pool_2("pool2");
  int64_t pool_1_id = -1;   // recorded once "pool1" is created, used at the end

  {
    // build customized crush rule for "pool1"
    string host_name = "host_for_pool_1";
    // build a customized host to capture osd.1~4
    // (NOTE: the loop is [1, 5), so osd.5 is *not* moved; osd.0 is left
    // out deliberately because the rule below TAKEs it explicitly)
    for (int i = 1; i < 5; i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0);   // crush bucket ids are negative
    string rule_name = "rule_for_pool1";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    int rno;
    // find the first unused rule slot
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    int min_size = 3;
    int max_size = 3;
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);   // all slots filled
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      // install the customized crush map
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool1"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pool_1_id = pool_id;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->min_size = 1;
    p->set_pg_num(3);
    p->set_pgp_num(3);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_1;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
    {
      for (unsigned i = 0; i < 3; i++) {
        // 1.x -> [1]
        pg_t rawpg(i, pool_id);
        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
        vector<int> up;
        int up_primary;
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 3);
        ASSERT_TRUE(up[0] == 0);   // the rule above always TAKEs osd.0 first

        // insert a new pg_upmap
        vector<int32_t> new_up;
        // and remap 1.x to osd.1 only
        // this way osd.0 is deemed to be *underfull*
        // and osd.1 is deemed to be *overfull*
        new_up.push_back(1);
        {
          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
            new_up.begin(), new_up.end());
          osdmap.apply_incremental(pending_inc);
        }
        // verify the upmap took effect: acting set collapsed to [1]
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 1);
        ASSERT_TRUE(up[0] == 1);
      }
    }
  }

  {
    // build customized crush rule for "pool2"
    string host_name = "host_for_pool_2";
    // build a customized host to capture osd.6~11
    for (int i = 6; i < (int)get_num_osds(); i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0);   // crush bucket ids are negative
    string rule_name = "rule_for_pool2";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    int rno;
    // find the first unused rule slot
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    int min_size = 3;
    int max_size = 3;
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);   // all slots filled
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      // install the customized crush map
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool2"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    // include a single PG
    p->set_pg_num(1);
    p->set_pgp_num(1);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_2;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
    pg_t rawpg(0, pool_id);
    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
    vector<int> up;
    int up_primary;
    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
    ASSERT_TRUE(up.size() == 3);
    ASSERT_TRUE(up[0] == 0);   // the rule above always TAKEs osd.0 first

    {
      // build a pg_upmap_item that will
      // remap pg out from *underfull* osd.0
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      osdmap.apply_incremental(pending_inc);
      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
      vector<int> up;
      int up_primary;
      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
      ASSERT_TRUE(up.size() == 3);
      ASSERT_TRUE(up[0] == 10);   // the upmap item replaced osd.0
    }
  }

  // ready to go
  {
    // pool2 (which also has upmaps) is deliberately NOT in only_pools;
    // calc_pg_upmaps must cope with that (this is the 38897 scenario)
    set<int64_t> only_pools;
    ASSERT_TRUE(pool_1_id >= 0);
    only_pools.insert(pool_1_id);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    // require perfect distribution! (max deviation 0)
    osdmap.calc_pg_upmaps(g_ceph_context,
                          0, // so we can force optimizing
                          100,
                          only_pools,
                          &pending_inc);
    osdmap.apply_incremental(pending_inc);
  }
}
1341
1342 TEST_F(OSDMapTest, BUG_40104) {
1343 // http://tracker.ceph.com/issues/40104
1344 int big_osd_num = 5000;
1345 int big_pg_num = 10000;
1346 set_up_map(big_osd_num, true);
1347 int pool_id;
1348 {
1349 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1350 pending_inc.new_pool_max = osdmap.get_pool_max();
1351 pool_id = ++pending_inc.new_pool_max;
1352 pg_pool_t empty;
1353 auto p = pending_inc.get_new_pool(pool_id, &empty);
1354 p->size = 3;
1355 p->min_size = 1;
1356 p->set_pg_num(big_pg_num);
1357 p->set_pgp_num(big_pg_num);
1358 p->type = pg_pool_t::TYPE_REPLICATED;
1359 p->crush_rule = 0;
1360 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1361 pending_inc.new_pool_names[pool_id] = "big_pool";
1362 osdmap.apply_incremental(pending_inc);
1363 ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
1364 ASSERT_TRUE(osdmap.get_pool_name(pool_id) == "big_pool");
1365 }
1366 {
1367 // generate pg_upmap_items for each pg
1368 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1369 for (int i = 0; i < big_pg_num; i++) {
1370 pg_t rawpg(i, pool_id);
1371 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
1372 vector<int> up;
1373 int up_primary;
1374 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
1375 ASSERT_TRUE(up.size() == 3);
1376 int victim = up[0];
1377 int replaced_by = random() % big_osd_num;
1378 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1379 // note that it might or might not be valid, we don't care
1380 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1381 pending_inc.new_pg_upmap_items[pgid] =
1382 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1383 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1384 }
1385 osdmap.apply_incremental(pending_inc);
1386 }
1387 {
1388 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1389 auto start = mono_clock::now();
1390 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1391 auto latency = mono_clock::now() - start;
1392 std::cout << "clean_pg_upmaps (~" << big_pg_num
1393 << " pg_upmap_items) latency:" << timespan_str(latency)
1394 << std::endl;
1395 }
1396 }
1397
1398 TEST_F(OSDMapTest, BUG_42052) {
1399 // https://tracker.ceph.com/issues/42052
1400 set_up_map(6, true);
1401 const string pool_name("pool");
1402 // build customized crush rule for "pool"
1403 CrushWrapper crush;
1404 get_crush(osdmap, crush);
1405 string rule_name = "rule";
1406 int rule_type = pg_pool_t::TYPE_REPLICATED;
1407 ASSERT_TRUE(!crush.rule_exists(rule_name));
1408 int rno;
1409 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1410 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
1411 break;
1412 }
1413 int min_size = 3;
1414 int max_size = 3;
1415 int steps = 8;
1416 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
1417 int step = 0;
1418 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1419 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1420 // always choose osd.0, osd.1, osd.2
1421 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
1422 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1423 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 1);
1424 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1425 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 2);
1426 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1427 ASSERT_TRUE(step == steps);
1428 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1429 ASSERT_TRUE(r >= 0);
1430 crush.set_rule_name(rno, rule_name);
1431 {
1432 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1433 pending_inc.crush.clear();
1434 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1435 osdmap.apply_incremental(pending_inc);
1436 }
1437
1438 // create "pool"
1439 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1440 pending_inc.new_pool_max = osdmap.get_pool_max();
1441 auto pool_id = ++pending_inc.new_pool_max;
1442 pg_pool_t empty;
1443 auto p = pending_inc.get_new_pool(pool_id, &empty);
1444 p->size = 3;
1445 p->min_size = 1;
1446 p->set_pg_num(1);
1447 p->set_pgp_num(1);
1448 p->type = pg_pool_t::TYPE_REPLICATED;
1449 p->crush_rule = rno;
1450 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1451 pending_inc.new_pool_names[pool_id] = pool_name;
1452 osdmap.apply_incremental(pending_inc);
1453 ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
1454 ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_name);
1455 pg_t rawpg(0, pool_id);
1456 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
1457 {
1458 // pg_upmap 1.0 [2,3,5]
1459 vector<int32_t> new_up{2,3,5};
1460 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1461 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
1462 new_up.begin(), new_up.end());
1463 osdmap.apply_incremental(pending_inc);
1464 }
1465 {
1466 // pg_upmap_items 1.0 [0,3,4,5]
1467 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1468 new_pg_upmap_items.push_back(make_pair(0, 3));
1469 new_pg_upmap_items.push_back(make_pair(4, 5));
1470 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1471 pending_inc.new_pg_upmap_items[pgid] =
1472 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1473 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1474 osdmap.apply_incremental(pending_inc);
1475 }
1476 {
1477 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1478 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1479 osdmap.apply_incremental(pending_inc);
1480 ASSERT_FALSE(osdmap.have_pg_upmaps(pgid));
1481 }
1482 }
1483
1484 TEST_F(OSDMapTest, BUG_42485) {
1485 set_up_map(60);
1486 {
1487 // build a temporary crush topology of 2datacenters, 3racks per dc,
1488 // 1host per rack, 10osds per host
1489 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
1490 tmp.deepish_copy_from(osdmap);
1491 const int expected_host_num = 6;
1492 int osd_per_host = (int)get_num_osds() / expected_host_num;
1493 ASSERT_GE(osd_per_host, 10);
1494 int host_per_dc = 3;
1495 int index = 0;
1496 int dc_index = 0;
1497 for (int i = 0; i < (int)get_num_osds(); i++) {
1498 if (i && i % osd_per_host == 0) {
1499 ++index;
1500 }
1501 if (i && i % (host_per_dc * osd_per_host) == 0) {
1502 ++dc_index;
1503 }
1504 stringstream osd_name;
1505 stringstream host_name;
1506 stringstream rack_name;
1507 stringstream dc_name;
1508 vector<string> move_to;
1509 osd_name << "osd." << i;
1510 host_name << "host-" << index;
1511 rack_name << "rack-" << index;
1512 dc_name << "dc-" << dc_index;
1513 move_to.push_back("root=default");
1514 string dc_loc = "datacenter=" + dc_name.str();
1515 move_to.push_back(dc_loc);
1516 string rack_loc = "rack=" + rack_name.str();
1517 move_to.push_back(rack_loc);
1518 string host_loc = "host=" + host_name.str();
1519 move_to.push_back(host_loc);
1520 auto r = crush_move(tmp, osd_name.str(), move_to);
1521 ASSERT_EQ(0, r);
1522 }
1523
1524 // build crush rule
1525 CrushWrapper crush;
1526 get_crush(tmp, crush);
1527 string rule_name = "rule_xeus_993_1";
1528 int rule_type = pg_pool_t::TYPE_REPLICATED;
1529 ASSERT_TRUE(!crush.rule_exists(rule_name));
1530 int rno;
1531 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1532 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
1533 break;
1534 }
1535 string root_name = "default";
1536 string dc_1 = "dc-0";
1537 int dc1 = crush.get_item_id(dc_1);
1538 string dc_2 = "dc-1";
1539 int dc2 = crush.get_item_id(dc_2);
1540 int min_size = 1;
1541 int max_size = 20;
1542 int steps = 8;
1543 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
1544 int step = 0;
1545 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1546 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1547 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc1, 0);
1548 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */);
1549 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1550 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc2, 0);
1551 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */);
1552 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1553 ASSERT_TRUE(step == steps);
1554 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1555 ASSERT_TRUE(r >= 0);
1556 crush.set_rule_name(rno, rule_name);
1557 {
1558 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1559 pending_inc.crush.clear();
1560 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1561 tmp.apply_incremental(pending_inc);
1562 }
1563 // create a repliacted pool referencing the above rule
1564 int64_t pool_xeus_993;
1565 {
1566 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
1567 new_pool_inc.new_pool_max = tmp.get_pool_max();
1568 new_pool_inc.fsid = tmp.get_fsid();
1569 pg_pool_t empty;
1570 pool_xeus_993 = ++new_pool_inc.new_pool_max;
1571 pg_pool_t *p = new_pool_inc.get_new_pool(pool_xeus_993, &empty);
1572 p->size = 4;
1573 p->set_pg_num(4096);
1574 p->set_pgp_num(4096);
1575 p->type = pg_pool_t::TYPE_REPLICATED;
1576 p->crush_rule = rno;
1577 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1578 new_pool_inc.new_pool_names[pool_xeus_993] = "pool_xeus_993";
1579 tmp.apply_incremental(new_pool_inc);
1580 }
1581
1582 pg_t rep_pg(0, pool_xeus_993);
1583 pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
1584 {
1585 int from = -1;
1586 int to = -1;
1587 vector<int> rep_up;
1588 int rep_up_primary;
1589 tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
1590 std::cout << "pgid " << rep_up << " up " << rep_up << std::endl;
1591 ASSERT_TRUE(rep_up.size() == 4);
1592 from = *(rep_up.begin());
1593 ASSERT_TRUE(from >= 0);
1594 auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno);
1595 if (dc_parent == dc1)
1596 dc_parent = dc2;
1597 else
1598 dc_parent = dc1;
1599 auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
1600 ASSERT_TRUE(dc_parent < 0);
1601 ASSERT_TRUE(rack_parent < 0);
1602 set<int> rack_parents;
1603 for (auto &i: rep_up) {
1604 if (i == from) continue;
1605 auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1606 rack_parents.insert(rack_parent);
1607 }
1608 for (int i = 0; i < (int)get_num_osds(); i++) {
1609 if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
1610 auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno);
1611 auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1612 if (dc_p == dc_parent &&
1613 rack_parents.find(rack_p) == rack_parents.end()) {
1614 to = i;
1615 break;
1616 }
1617 }
1618 }
1619 ASSERT_TRUE(to >= 0);
1620 ASSERT_TRUE(from != to);
1621 std::cout << "from " << from << " to " << to << std::endl;
1622 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1623 new_pg_upmap_items.push_back(make_pair(from, to));
1624 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1625 pending_inc.new_pg_upmap_items[rep_pgid] =
1626 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1627 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1628 tmp.apply_incremental(pending_inc);
1629 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1630 }
1631 pg_t rep_pg2(2, pool_xeus_993);
1632 pg_t rep_pgid2 = tmp.raw_pg_to_pg(rep_pg2);
1633 {
1634 pg_t rep_pgid = rep_pgid2;
1635 vector<int> from_osds{-1, -1};
1636 vector<int> rep_up;
1637 int rep_up_primary;
1638 tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
1639 ASSERT_TRUE(rep_up.size() == 4);
1640 from_osds[0] = *(rep_up.begin());
1641 from_osds[1] = *(rep_up.rbegin());
1642 std::cout << "pgid " << rep_pgid2 << " up " << rep_up << std::endl;
1643 ASSERT_TRUE(*(from_osds.begin()) >= 0);
1644 ASSERT_TRUE(*(from_osds.rbegin()) >= 0);
1645 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1646 for (auto &from: from_osds) {
1647 int to = -1;
1648 auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno);
1649 if (dc_parent == dc1)
1650 dc_parent = dc2;
1651 else
1652 dc_parent = dc1;
1653 auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
1654 ASSERT_TRUE(dc_parent < 0);
1655 ASSERT_TRUE(rack_parent < 0);
1656 set<int> rack_parents;
1657 for (auto &i: rep_up) {
1658 if (i == from) continue;
1659 auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1660 rack_parents.insert(rack_parent);
1661 }
1662 for (auto &i: new_pg_upmap_items) {
1663 auto rack_from = tmp.crush->get_parent_of_type(i.first, 3, rno);
1664 auto rack_to = tmp.crush->get_parent_of_type(i.second, 3, rno);
1665 rack_parents.insert(rack_from);
1666 rack_parents.insert(rack_to);
1667 }
1668 for (int i = 0; i < (int)get_num_osds(); i++) {
1669 if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
1670 auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno);
1671 auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1672 if (dc_p == dc_parent &&
1673 rack_parents.find(rack_p) == rack_parents.end()) {
1674 to = i;
1675 break;
1676 }
1677 }
1678 }
1679 ASSERT_TRUE(to >= 0);
1680 ASSERT_TRUE(from != to);
1681 std::cout << "from " << from << " to " << to << std::endl;
1682 new_pg_upmap_items.push_back(make_pair(from, to));
1683 }
1684 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1685 pending_inc.new_pg_upmap_items[rep_pgid] =
1686 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1687 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1688 tmp.apply_incremental(pending_inc);
1689 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1690 }
1691 {
1692 // *maybe_remove_pg_upmaps* should remove the above upmap_item
1693 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1694 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
1695 tmp.apply_incremental(pending_inc);
1696 ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid));
1697 ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid2));
1698 }
1699 }
1700 }
1701
1702 TEST(PGTempMap, basic)
1703 {
1704 PGTempMap m;
1705 pg_t a(1,1);
1706 for (auto i=3; i<1000; ++i) {
1707 pg_t x(i, 1);
1708 m.set(x, {static_cast<int>(i)});
1709 }
1710 pg_t b(2,1);
1711 m.set(a, {1, 2});
1712 ASSERT_NE(m.find(a), m.end());
1713 ASSERT_EQ(m.find(a), m.begin());
1714 ASSERT_EQ(m.find(b), m.end());
1715 ASSERT_EQ(998u, m.size());
1716 }
1717
1718 TEST_F(OSDMapTest, BUG_43124) {
1719 set_up_map(200);
1720 {
1721 // https://tracker.ceph.com/issues/43124
1722
1723 // build a temporary crush topology of 5racks,
1724 // 4 hosts per rack, 10osds per host
1725 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
1726 tmp.deepish_copy_from(osdmap);
1727 const int expected_host_num = 20;
1728 int osd_per_host = (int)get_num_osds() / expected_host_num;
1729 ASSERT_GE(osd_per_host, 10);
1730 int host_per_rack = 4;
1731 int index = 0;
1732 int rack_index = 0;
1733 for (int i = 0; i < (int)get_num_osds(); i++) {
1734 if (i && i % osd_per_host == 0) {
1735 ++index;
1736 }
1737 if (i && i % (host_per_rack * osd_per_host) == 0) {
1738 ++rack_index;
1739 }
1740 stringstream osd_name;
1741 stringstream host_name;
1742 stringstream rack_name;
1743 vector<string> move_to;
1744 osd_name << "osd." << i;
1745 host_name << "host-" << index;
1746 rack_name << "rack-" << rack_index;
1747 move_to.push_back("root=default");
1748 string rack_loc = "rack=" + rack_name.str();
1749 move_to.push_back(rack_loc);
1750 string host_loc = "host=" + host_name.str();
1751 move_to.push_back(host_loc);
1752 auto r = crush_move(tmp, osd_name.str(), move_to);
1753 ASSERT_EQ(0, r);
1754 }
1755
1756 // build crush rule
1757 CrushWrapper crush;
1758 get_crush(tmp, crush);
1759 string rule_name = "rule_angel_1944";
1760 int rule_type = pg_pool_t::TYPE_ERASURE;
1761 ASSERT_TRUE(!crush.rule_exists(rule_name));
1762 int rno;
1763 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1764 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
1765 break;
1766 }
1767 int min_size = 1;
1768 int max_size = 20;
1769 int steps = 6;
1770 string root_name = "default";
1771 int root = crush.get_item_id(root_name);
1772 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
1773 int step = 0;
1774 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1775 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1776 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1777 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_FIRSTN, 4, 3 /* rack */);
1778 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_INDEP, 3, 1 /* host */);
1779 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1780 ASSERT_TRUE(step == steps);
1781 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1782 ASSERT_TRUE(r >= 0);
1783 crush.set_rule_name(rno, rule_name);
1784 {
1785 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1786 pending_inc.crush.clear();
1787 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1788 tmp.apply_incremental(pending_inc);
1789 }
1790 {
1791 stringstream oss;
1792 crush.dump_tree(&oss, NULL);
1793 std::cout << oss.str() << std::endl;
1794 Formatter *f = Formatter::create("json-pretty");
1795 f->open_object_section("crush_rules");
1796 crush.dump_rules(f);
1797 f->close_section();
1798 f->flush(cout);
1799 delete f;
1800 }
1801 // create a erasuce-coded pool referencing the above rule
1802 int64_t pool_angel_1944;
1803 {
1804 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
1805 new_pool_inc.new_pool_max = tmp.get_pool_max();
1806 new_pool_inc.fsid = tmp.get_fsid();
1807 pg_pool_t empty;
1808 pool_angel_1944 = ++new_pool_inc.new_pool_max;
1809 pg_pool_t *p = new_pool_inc.get_new_pool(pool_angel_1944, &empty);
1810 p->size = 12;
1811 p->set_pg_num(4096);
1812 p->set_pgp_num(4096);
1813 p->type = pg_pool_t::TYPE_ERASURE;
1814 p->crush_rule = rno;
1815 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1816 new_pool_inc.new_pool_names[pool_angel_1944] = "pool_angel_1944";
1817 tmp.apply_incremental(new_pool_inc);
1818 }
1819
1820 pg_t rep_pg(0, pool_angel_1944);
1821 pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
1822 {
1823 // insert a pg_upmap_item
1824 int from = -1;
1825 int to = -1;
1826 vector<int> rep_up;
1827 int rep_up_primary;
1828 tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
1829 std::cout << "pgid " << rep_pgid << " up " << rep_up << std::endl;
1830 ASSERT_TRUE(rep_up.size() == 12);
1831 from = *(rep_up.begin());
1832 ASSERT_TRUE(from >= 0);
1833 auto from_rack = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
1834 set<int> failure_domains;
1835 for (auto &osd : rep_up) {
1836 failure_domains.insert(tmp.crush->get_parent_of_type(osd, 1 /* host */, rno));
1837 }
1838 for (int i = 0; i < (int)get_num_osds(); i++) {
1839 if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
1840 auto to_rack = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1841 auto to_host = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
1842 if (to_rack != from_rack && failure_domains.count(to_host) == 0) {
1843 to = i;
1844 break;
1845 }
1846 }
1847 }
1848 ASSERT_TRUE(to >= 0);
1849 ASSERT_TRUE(from != to);
1850 std::cout << "from " << from << " to " << to << std::endl;
1851 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1852 new_pg_upmap_items.push_back(make_pair(from, to));
1853 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1854 pending_inc.new_pg_upmap_items[rep_pgid] =
1855 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1856 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1857 tmp.apply_incremental(pending_inc);
1858 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1859 }
1860 {
1861 // *maybe_remove_pg_upmaps* should not remove the above upmap_item
1862 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1863 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
1864 tmp.apply_incremental(pending_inc);
1865 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1866 }
1867 }
1868 }
1869
1870 TEST_F(OSDMapTest, BUG_48884)
1871 {
1872
1873 set_up_map(12);
1874
1875 unsigned int host_index = 1;
1876 for (unsigned int x=0; x < get_num_osds();) {
1877 // Create three hosts with four osds each
1878 for (unsigned int y=0; y < 4; y++) {
1879 stringstream osd_name;
1880 stringstream host_name;
1881 vector<string> move_to;
1882 osd_name << "osd." << x;
1883 host_name << "host-" << host_index;
1884 move_to.push_back("root=default");
1885 move_to.push_back("rack=localrack");
1886 string host_loc = "host=" + host_name.str();
1887 move_to.push_back(host_loc);
1888 int r = crush_move(osdmap, osd_name.str(), move_to);
1889 ASSERT_EQ(0, r);
1890 x++;
1891 }
1892 host_index++;
1893 }
1894
1895 CrushWrapper crush;
1896 get_crush(osdmap, crush);
1897 auto host_id = crush.get_item_id("localhost");
1898 crush.remove_item(g_ceph_context, host_id, false);
1899 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1900 pending_inc.crush.clear();
1901 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1902 osdmap.apply_incremental(pending_inc);
1903
1904 PGMap pgmap;
1905 osd_stat_t stats, stats_null;
1906 stats.statfs.total = 500000;
1907 stats.statfs.available = 50000;
1908 stats.statfs.omap_allocated = 50000;
1909 stats.statfs.internal_metadata = 50000;
1910 stats_null.statfs.total = 0;
1911 stats_null.statfs.available = 0;
1912 stats_null.statfs.omap_allocated = 0;
1913 stats_null.statfs.internal_metadata = 0;
1914 for (unsigned int x=0; x < get_num_osds(); x++) {
1915 if (x > 3 && x < 8) {
1916 pgmap.osd_stat.insert({x,stats_null});
1917 } else {
1918 pgmap.osd_stat.insert({x,stats});
1919 }
1920 }
1921
1922 stringstream ss;
1923 boost::scoped_ptr<Formatter> f(Formatter::create("json-pretty"));
1924 print_osd_utilization(osdmap, pgmap, ss, f.get(), true, "root");
1925 JSONParser parser;
1926 parser.parse(ss.str().c_str(), static_cast<int>(ss.str().size()));
1927 auto iter = parser.find_first();
1928 for (const auto& bucket : (*iter)->get_array_elements()) {
1929 JSONParser parser2;
1930 parser2.parse(bucket.c_str(), static_cast<int>(bucket.size()));
1931 auto* obj = parser2.find_obj("name");
1932 if (obj->get_data_val().str.compare("localrack") == 0) {
1933 obj = parser2.find_obj("kb");
1934 ASSERT_EQ(obj->get_data_val().str, "3904");
1935 obj = parser2.find_obj("kb_used");
1936 ASSERT_EQ(obj->get_data_val().str, "3512");
1937 obj = parser2.find_obj("kb_used_omap");
1938 ASSERT_EQ(obj->get_data_val().str, "384");
1939 obj = parser2.find_obj("kb_used_meta");
1940 ASSERT_EQ(obj->get_data_val().str, "384");
1941 obj = parser2.find_obj("kb_avail");
1942 ASSERT_EQ(obj->get_data_val().str, "384");
1943 }
1944 }
1945 }