]> git.proxmox.com Git - ceph.git/blob - ceph/src/test/osd/TestOSDMap.cc
import ceph 14.2.5
[ceph.git] / ceph / src / test / osd / TestOSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 #include "gtest/gtest.h"
3 #include "osd/OSDMap.h"
4 #include "osd/OSDMapMapping.h"
5 #include "mon/OSDMonitor.h"
6
7 #include "global/global_context.h"
8 #include "global/global_init.h"
9 #include "common/common_init.h"
10 #include "common/ceph_argparse.h"
11
12 #include <iostream>
13
14 using namespace std;
15
16 int main(int argc, char **argv) {
17 map<string,string> defaults = {
18 // make sure we have 3 copies, or some tests won't work
19 { "osd_pool_default_size", "3" },
20 // our map is flat, so just try and split across OSDs, not hosts or whatever
21 { "osd_crush_chooseleaf_type", "0" },
22 };
23 std::vector<const char*> args(argv, argv+argc);
24 auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
25 CODE_ENVIRONMENT_UTILITY,
26 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
27 common_init_finish(g_ceph_context);
28 ::testing::InitGoogleTest(&argc, argv);
29 return RUN_ALL_TESTS();
30 }
31
// Shared fixture for the OSDMap tests: builds a small flat map (6 OSDs by
// default) with all OSDs up and in, optionally creates one erasure-coded
// and one replicated pool, and provides helpers for crush manipulation and
// mapping verification.
class OSDMapTest : public testing::Test {
  int num_osds = 6;
public:
  OSDMap osdmap;
  OSDMapMapping mapping;
  // Pool ids the default pools are expected to receive (asserted below).
  const uint64_t my_ec_pool = 1;
  const uint64_t my_rep_pool = 2;


  OSDMapTest() {}

  // Build a simple map with new_num_osds OSDs, each marked EXISTS|NEW, up
  // and in, with a fresh uuid.  Unless no_default_pools is set, also create
  // pool "ec" (erasure, expected id my_ec_pool) and pool "reppool"
  // (replicated, expected id my_rep_pool), both size 3 with 64 PGs.
  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
    num_osds = new_num_osds;
    uuid_d fsid;
    osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.fsid = osdmap.get_fsid();
    entity_addrvec_t sample_addrs;
    sample_addrs.v.push_back(entity_addr_t());
    uuid_d sample_uuid;
    for (int i = 0; i < num_osds; ++i) {
      sample_uuid.generate_random();
      sample_addrs.v[0].nonce = i;  // keep per-OSD addresses distinct
      pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW;
      pending_inc.new_up_client[i] = sample_addrs;
      pending_inc.new_up_cluster[i] = sample_addrs;
      pending_inc.new_hb_back_up[i] = sample_addrs;
      pending_inc.new_hb_front_up[i] = sample_addrs;
      pending_inc.new_weight[i] = CEPH_OSD_IN;
      pending_inc.new_uuid[i] = sample_uuid;
    }
    osdmap.apply_incremental(pending_inc);
    if (no_default_pools) // do not create any default pool(s)
      return;

    // Create an EC ruleset and a pool using it
    int r = osdmap.crush->add_simple_rule(
      "erasure", "default", "osd", "",
      "indep", pg_pool_t::TYPE_ERASURE,
      &cerr);

    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
    new_pool_inc.new_pool_max = osdmap.get_pool_max();
    new_pool_inc.fsid = osdmap.get_fsid();
    pg_pool_t empty;
    // make an ec pool
    uint64_t pool_id = ++new_pool_inc.new_pool_max;
    ceph_assert(pool_id == my_ec_pool);
    pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->set_pg_num(64);
    p->set_pgp_num(64);
    p->type = pg_pool_t::TYPE_ERASURE;
    p->crush_rule = r;
    new_pool_inc.new_pool_names[pool_id] = "ec";
    // and a replicated pool
    pool_id = ++new_pool_inc.new_pool_max;
    ceph_assert(pool_id == my_rep_pool);
    p = new_pool_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->set_pg_num(64);
    p->set_pgp_num(64);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = 0;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    new_pool_inc.new_pool_names[pool_id] = "reppool";
    osdmap.apply_incremental(new_pool_inc);
  }
  unsigned int get_num_osds() { return num_osds; }
  // Deep-copy tmap's crush map into newcrush via an encode/decode
  // round-trip, so callers can mutate it without touching tmap.
  void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) {
    bufferlist bl;
    tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
    auto p = bl.cbegin();
    newcrush.decode(p);
  }
  // Move the named crush item (device or bucket) to the location described
  // by argvec (e.g. {"root=default", "host=host-0"}) and apply the result
  // to tmap as a new incremental.  Returns 0 on success (including when the
  // item is already at that location), -ENOENT if the name is unknown, or a
  // crush error code.
  int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) {
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);
    CrushWrapper newcrush;
    get_crush(tmap, newcrush);
    if (!newcrush.name_exists(name)) {
      return -ENOENT;
    }
    int id = newcrush.get_item_id(name);
    int err;
    if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
      if (id >= 0) {
        // devices (id >= 0) are (re)created at the target location ...
        err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
      } else {
        // ... while buckets (id < 0) are moved wholesale
        err = newcrush.move_bucket(g_ceph_context, id, loc);
      }
      if (err >= 0) {
        OSDMap::Incremental pending_inc(tmap.get_epoch() + 1);
        pending_inc.crush.clear();
        newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
        tmap.apply_incremental(pending_inc);
        err = 0;
      }
    } else {
      // already there
      err = 0;
    }
    return err;
  }
  // Create (or look up) a replicated crush rule named @name rooted at @root
  // splitting across buckets of @type, apply it to osdmap, and return the
  // rule number (negative on failure).
  int crush_rule_create_replicated(const string &name,
                                   const string &root,
                                   const string &type) {
    if (osdmap.crush->rule_exists(name)) {
      return osdmap.crush->get_rule_id(name);
    }
    CrushWrapper newcrush;
    get_crush(osdmap, newcrush);
    string device_class;
    stringstream ss;
    int ruleno = newcrush.add_simple_rule(
      name, root, type, device_class,
      "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
    if (ruleno >= 0) {
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }
    return ruleno;
  }
  // Map PGs 0..num-1 of @pool and accumulate per-OSD histograms:
  //   any[osd]     - number of PGs where osd appears anywhere in acting
  //   first[osd]   - number of PGs where osd is acting[0]
  //   primary[osd] - number of PGs where osd is the acting primary
  // Also cross-checks every mapping against the precalculated
  // OSDMapMapping (asserts on any mismatch).
  void test_mappings(int pool,
                     int num,
                     vector<int> *any,
                     vector<int> *first,
                     vector<int> *primary) {
    mapping.update(osdmap);
    for (int i=0; i<num; ++i) {
      vector<int> up, acting;
      int up_primary, acting_primary;
      pg_t pgid(i, pool);
      osdmap.pg_to_up_acting_osds(pgid,
                                  &up, &up_primary, &acting, &acting_primary);
      for (unsigned j=0; j<acting.size(); ++j)
        (*any)[acting[j]]++;
      if (!acting.empty())
        (*first)[acting[0]]++;
      if (acting_primary >= 0)
        (*primary)[acting_primary]++;

      // compare to precalc mapping
      vector<int> up2, acting2;
      int up_primary2, acting_primary2;
      pgid = osdmap.raw_pg_to_pg(pgid);
      mapping.get(pgid, &up2, &up_primary2, &acting2, &acting_primary2);
      ASSERT_EQ(up, up2);
      ASSERT_EQ(up_primary, up_primary2);
      ASSERT_EQ(acting, acting2);
      ASSERT_EQ(acting_primary, acting_primary2);
    }
    cout << "any: " << *any << std::endl;;
    cout << "first: " << *first << std::endl;;
    cout << "primary: " << *primary << std::endl;;
  }
  // Run OSDMonitor::CleanUpmapJob over all upmap entries of @om using a
  // ParallelPGMapper on a small thread pool; removals of stale/invalid
  // upmaps are accumulated into @pending_inc.
  void clean_pg_upmaps(CephContext *cct,
                       const OSDMap& om,
                       OSDMap::Incremental& pending_inc) {
    int cpu_num = 8;
    int pgs_per_chunk = 256;
    ThreadPool tp(cct, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num);
    tp.start();
    ParallelPGMapper mapper(cct, &tp);
    vector<pg_t> pgs_to_check;
    om.get_upmap_pgs(&pgs_to_check);
    OSDMonitor::CleanUpmapJob job(cct, om, pending_inc);
    mapper.queue(&job, pgs_per_chunk, pgs_to_check);
    job.wait();
    tp.stop();
  }
};
206
207 TEST_F(OSDMapTest, Create) {
208 set_up_map();
209 ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
210 ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
211 }
212
// The feature bits a map advertises depend on the entity type asking and on
// what the map contains (crush tunables, pool types, primary affinity).
TEST_F(OSDMapTest, Features) {
  // with EC pool
  set_up_map();
  uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // clients have a slightly different view
  features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // remove the EC pool, but leave the rule. add primary affinity.
  {
    OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
    new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
    new_pool_inc.new_primary_affinity[0] = 0x8000;
    osdmap.apply_incremental(new_pool_inc);
  }

  features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
  ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
  // with the EC pool gone, CRUSH_V2 is no longer advertised even though the
  // (now unused) erasure rule is still present
  ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
  ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
  ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);

  // FIXME: test tiering feature bits
}
251
252 TEST_F(OSDMapTest, MapPG) {
253 set_up_map();
254
255 std::cerr << " osdmap.pool_max==" << osdmap.get_pool_max() << std::endl;
256 pg_t rawpg(0, my_rep_pool);
257 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
258 vector<int> up_osds, acting_osds;
259 int up_primary, acting_primary;
260
261 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
262 &acting_osds, &acting_primary);
263
264 vector<int> old_up_osds, old_acting_osds;
265 osdmap.pg_to_up_acting_osds(pgid, old_up_osds, old_acting_osds);
266 ASSERT_EQ(old_up_osds, up_osds);
267 ASSERT_EQ(old_acting_osds, acting_osds);
268
269 ASSERT_EQ(osdmap.get_pg_pool(my_rep_pool)->get_size(), up_osds.size());
270 }
271
272 TEST_F(OSDMapTest, MapFunctionsMatch) {
273 // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match
274 set_up_map();
275 pg_t rawpg(0, my_rep_pool);
276 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
277 vector<int> up_osds, acting_osds;
278 int up_primary, acting_primary;
279
280 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
281 &acting_osds, &acting_primary);
282
283 vector<int> up_osds_two, acting_osds_two;
284
285 osdmap.pg_to_up_acting_osds(pgid, up_osds_two, acting_osds_two);
286
287 ASSERT_EQ(up_osds, up_osds_two);
288 ASSERT_EQ(acting_osds, acting_osds_two);
289
290 int acting_primary_two;
291 osdmap.pg_to_acting_osds(pgid, &acting_osds_two, &acting_primary_two);
292 EXPECT_EQ(acting_osds, acting_osds_two);
293 EXPECT_EQ(acting_primary, acting_primary_two);
294 osdmap.pg_to_acting_osds(pgid, acting_osds_two);
295 EXPECT_EQ(acting_osds, acting_osds_two);
296 }
297
298 /** This test must be removed or modified appropriately when we allow
299 * other ways to specify a primary. */
300 TEST_F(OSDMapTest, PrimaryIsFirst) {
301 set_up_map();
302
303 pg_t rawpg(0, my_rep_pool);
304 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
305 vector<int> up_osds, acting_osds;
306 int up_primary, acting_primary;
307
308 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
309 &acting_osds, &acting_primary);
310 EXPECT_EQ(up_osds[0], up_primary);
311 EXPECT_EQ(acting_osds[0], acting_primary);
312 }
313
314 TEST_F(OSDMapTest, PGTempRespected) {
315 set_up_map();
316
317 pg_t rawpg(0, my_rep_pool);
318 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
319 vector<int> up_osds, acting_osds;
320 int up_primary, acting_primary;
321
322 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
323 &acting_osds, &acting_primary);
324
325 // copy and swap first and last element in acting_osds
326 vector<int> new_acting_osds(acting_osds);
327 int first = new_acting_osds[0];
328 new_acting_osds[0] = *new_acting_osds.rbegin();
329 *new_acting_osds.rbegin() = first;
330
331 // apply pg_temp to osdmap
332 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
333 pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
334 new_acting_osds.begin(), new_acting_osds.end());
335 osdmap.apply_incremental(pgtemp_map);
336
337 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
338 &acting_osds, &acting_primary);
339 EXPECT_EQ(new_acting_osds, acting_osds);
340 }
341
342 TEST_F(OSDMapTest, PrimaryTempRespected) {
343 set_up_map();
344
345 pg_t rawpg(0, my_rep_pool);
346 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
347 vector<int> up_osds;
348 vector<int> acting_osds;
349 int up_primary, acting_primary;
350
351 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
352 &acting_osds, &acting_primary);
353
354 // make second OSD primary via incremental
355 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
356 pgtemp_map.new_primary_temp[pgid] = acting_osds[1];
357 osdmap.apply_incremental(pgtemp_map);
358
359 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
360 &acting_osds, &acting_primary);
361 EXPECT_EQ(acting_primary, acting_osds[1]);
362 }
363
// clean_temps() must (a) schedule removal of temps already in the map that
// are now useless, and (b) drop proposed temps that merely restate the raw
// mapping before they ever land.
TEST_F(OSDMapTest, CleanTemps) {
  set_up_map();

  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 2);
  pg_t pga = osdmap.raw_pg_to_pg(pg_t(0, my_rep_pool));
  {
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pga, &up_osds, &up_primary,
                                &acting_osds, &acting_primary);
    // temps identical to the raw mapping: useless once applied
    pgtemp_map.new_pg_temp[pga] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pgtemp_map.new_primary_temp[pga] = up_primary;
  }
  pg_t pgb = osdmap.raw_pg_to_pg(pg_t(1, my_rep_pool));
  {
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pgb, &up_osds, &up_primary,
                                &acting_osds, &acting_primary);
    // same kind of no-op temps, but only proposed in pending_inc
    pending_inc.new_pg_temp[pgb] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pending_inc.new_primary_temp[pgb] = up_primary;
  }

  // pga's useless temps become part of the live map ...
  osdmap.apply_incremental(pgtemp_map);

  OSDMap tmpmap;
  tmpmap.deepish_copy_from(osdmap);
  tmpmap.apply_incremental(pending_inc);
  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);

  // ... so clean_temps must schedule their removal: an empty pg_temp
  // vector and a -1 primary_temp in pending_inc
  EXPECT_TRUE(pending_inc.new_pg_temp.count(pga) &&
              pending_inc.new_pg_temp[pga].size() == 0);
  EXPECT_EQ(-1, pending_inc.new_primary_temp[pga]);

  // pgb's temps were never applied, so they are simply dropped from the
  // pending incremental rather than turned into removals
  EXPECT_TRUE(!pending_inc.new_pg_temp.count(pgb) &&
              !pending_inc.new_primary_temp.count(pgb));
}
404
405 TEST_F(OSDMapTest, KeepsNecessaryTemps) {
406 set_up_map();
407
408 pg_t rawpg(0, my_rep_pool);
409 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
410 vector<int> up_osds, acting_osds;
411 int up_primary, acting_primary;
412
413 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
414 &acting_osds, &acting_primary);
415
416 // find unused OSD and stick it in there
417 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
418 // find an unused osd and put it in place of the first one
419 int i = 0;
420 for(; i != (int)get_num_osds(); ++i) {
421 bool in_use = false;
422 for (vector<int>::iterator osd_it = up_osds.begin();
423 osd_it != up_osds.end();
424 ++osd_it) {
425 if (i == *osd_it) {
426 in_use = true;
427 break;
428 }
429 }
430 if (!in_use) {
431 up_osds[1] = i;
432 break;
433 }
434 }
435 if (i == (int)get_num_osds())
436 FAIL() << "did not find unused OSD for temp mapping";
437
438 pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
439 up_osds.begin(), up_osds.end());
440 pgtemp_map.new_primary_temp[pgid] = up_osds[1];
441 osdmap.apply_incremental(pgtemp_map);
442
443 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
444
445 OSDMap tmpmap;
446 tmpmap.deepish_copy_from(osdmap);
447 tmpmap.apply_incremental(pending_inc);
448 OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
449 EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid));
450 EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid));
451 }
452
// Primary affinity must bias which OSD becomes primary (and, for replicated
// pools, which appears first) without affecting which OSDs hold data: the
// "any" histogram stays fully populated in every phase.
TEST_F(OSDMapTest, PrimaryAffinity) {
  set_up_map();

  int n = get_num_osds();
  for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
       p != osdmap.get_pools().end();
       ++p) {
    int pool = p->first;
    int expect_primary = 10000 / n;
    cout << "pool " << pool << " size " << (int)p->second.size
         << " expect_primary " << expect_primary << std::endl;
    {
      // phase 1 (default affinity): every OSD appears somewhere in acting,
      // in the first slot, and as primary at least once across 10000 PGs
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        ASSERT_LT(0, first[i]);
        ASSERT_LT(0, primary[i]);
      }
    }

    // phase 2: zero affinity on osd.0 and osd.1 - they still store data
    // but must never be primary (nor first, for replicated pools)
    osdmap.set_primary_affinity(0, 0);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        if (i >= 2) {
          ASSERT_LT(0, first[i]);
          ASSERT_LT(0, primary[i]);
        } else {
          if (p->second.is_replicated()) {
            ASSERT_EQ(0, first[i]);
          }
          ASSERT_EQ(0, primary[i]);
        }
      }
    }

    // phase 3: osd.0 at half affinity (0x8000), osd.1 still zero - osd.0
    // should be primary roughly half as often as an unrestricted OSD
    osdmap.set_primary_affinity(0, 0x8000);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      int expect = (10000 / (n-2)) / 2; // half weight
      cout << "expect " << expect << std::endl;
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        if (i >= 2) {
          ASSERT_LT(0, first[i]);
          ASSERT_LT(0, primary[i]);
        } else if (i == 1) {
          if (p->second.is_replicated()) {
            ASSERT_EQ(0, first[i]);
          }
          ASSERT_EQ(0, primary[i]);
        } else {
          // allow +/- 1/3 slack around the expected count for osd.0
          ASSERT_LT(expect *2/3, primary[0]);
          ASSERT_GT(expect *4/3, primary[0]);
        }
      }
    }

    // restore full affinity (0x10000) before testing the next pool
    osdmap.set_primary_affinity(0, 0x10000);
    osdmap.set_primary_affinity(1, 0x10000);
  }
}
527
528 TEST_F(OSDMapTest, get_osd_crush_node_flags) {
529 set_up_map();
530
531 for (unsigned i=0; i<get_num_osds(); ++i) {
532 ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
533 }
534
535 OSDMap::Incremental inc(osdmap.get_epoch() + 1);
536 inc.new_crush_node_flags[-1] = 123u;
537 osdmap.apply_incremental(inc);
538 for (unsigned i=0; i<get_num_osds(); ++i) {
539 ASSERT_EQ(123u, osdmap.get_osd_crush_node_flags(i));
540 }
541 ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));
542
543 OSDMap::Incremental inc3(osdmap.get_epoch() + 1);
544 inc3.new_crush_node_flags[-1] = 456u;
545 osdmap.apply_incremental(inc3);
546 for (unsigned i=0; i<get_num_osds(); ++i) {
547 ASSERT_EQ(456u, osdmap.get_osd_crush_node_flags(i));
548 }
549 ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));
550
551 OSDMap::Incremental inc2(osdmap.get_epoch() + 1);
552 inc2.new_crush_node_flags[-1] = 0;
553 osdmap.apply_incremental(inc2);
554 for (unsigned i=0; i<get_num_osds(); ++i) {
555 ASSERT_EQ(0u, osdmap.get_crush_node_flags(i));
556 }
557 }
558
559 TEST_F(OSDMapTest, parse_osd_id_list) {
560 set_up_map();
561 set<int> out;
562 set<int> all;
563 osdmap.get_all_osds(all);
564
565 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0"}, &out, &cout));
566 ASSERT_EQ(1u, out.size());
567 ASSERT_EQ(0, *out.begin());
568
569 ASSERT_EQ(0, osdmap.parse_osd_id_list({"1"}, &out, &cout));
570 ASSERT_EQ(1u, out.size());
571 ASSERT_EQ(1, *out.begin());
572
573 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","osd.1"}, &out, &cout));
574 ASSERT_EQ(2u, out.size());
575 ASSERT_EQ(0, *out.begin());
576 ASSERT_EQ(1, *out.rbegin());
577
578 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","1"}, &out, &cout));
579 ASSERT_EQ(2u, out.size());
580 ASSERT_EQ(0, *out.begin());
581 ASSERT_EQ(1, *out.rbegin());
582
583 ASSERT_EQ(0, osdmap.parse_osd_id_list({"*"}, &out, &cout));
584 ASSERT_EQ(all.size(), out.size());
585 ASSERT_EQ(all, out);
586
587 ASSERT_EQ(0, osdmap.parse_osd_id_list({"all"}, &out, &cout));
588 ASSERT_EQ(all, out);
589
590 ASSERT_EQ(0, osdmap.parse_osd_id_list({"any"}, &out, &cout));
591 ASSERT_EQ(all, out);
592
593 ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"foo"}, &out, &cout));
594 ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
595 }
596
597 TEST_F(OSDMapTest, CleanPGUpmaps) {
598 set_up_map();
599
600 // build a crush rule of type host
601 const int expected_host_num = 3;
602 int osd_per_host = get_num_osds() / expected_host_num;
603 ASSERT_GE(2, osd_per_host);
604 int index = 0;
605 for (int i = 0; i < (int)get_num_osds(); i++) {
606 if (i && i % osd_per_host == 0) {
607 ++index;
608 }
609 stringstream osd_name;
610 stringstream host_name;
611 vector<string> move_to;
612 osd_name << "osd." << i;
613 host_name << "host-" << index;
614 move_to.push_back("root=default");
615 string host_loc = "host=" + host_name.str();
616 move_to.push_back(host_loc);
617 int r = crush_move(osdmap, osd_name.str(), move_to);
618 ASSERT_EQ(0, r);
619 }
620 const string upmap_rule = "upmap";
621 int upmap_rule_no = crush_rule_create_replicated(
622 upmap_rule, "default", "host");
623 ASSERT_LT(0, upmap_rule_no);
624
625 // create a replicated pool which references the above rule
626 OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
627 new_pool_inc.new_pool_max = osdmap.get_pool_max();
628 new_pool_inc.fsid = osdmap.get_fsid();
629 pg_pool_t empty;
630 uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
631 pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
632 p->size = 2;
633 p->set_pg_num(64);
634 p->set_pgp_num(64);
635 p->type = pg_pool_t::TYPE_REPLICATED;
636 p->crush_rule = upmap_rule_no;
637 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
638 new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
639 osdmap.apply_incremental(new_pool_inc);
640
641 pg_t rawpg(0, upmap_pool_id);
642 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
643 vector<int> up;
644 int up_primary;
645 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
646 ASSERT_LT(1U, up.size());
647 {
648 // validate we won't have two OSDs from a same host
649 int parent_0 = osdmap.crush->get_parent_of_type(up[0],
650 osdmap.crush->get_type_id("host"));
651 int parent_1 = osdmap.crush->get_parent_of_type(up[1],
652 osdmap.crush->get_type_id("host"));
653 ASSERT_TRUE(parent_0 != parent_1);
654 }
655
656 {
657 // cancel stale upmaps
658 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
659 int from = -1;
660 for (int i = 0; i < (int)get_num_osds(); i++) {
661 if (std::find(up.begin(), up.end(), i) == up.end()) {
662 from = i;
663 break;
664 }
665 }
666 ASSERT_TRUE(from >= 0);
667 int to = -1;
668 for (int i = 0; i < (int)get_num_osds(); i++) {
669 if (std::find(up.begin(), up.end(), i) == up.end() && i != from) {
670 to = i;
671 break;
672 }
673 }
674 ASSERT_TRUE(to >= 0);
675 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
676 new_pg_upmap_items.push_back(make_pair(from, to));
677 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
678 pending_inc.new_pg_upmap_items[pgid] =
679 mempool::osdmap::vector<pair<int32_t,int32_t>>(
680 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
681 OSDMap nextmap;
682 nextmap.deepish_copy_from(osdmap);
683 nextmap.apply_incremental(pending_inc);
684 ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
685 OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
686 clean_pg_upmaps(g_ceph_context, nextmap, new_pending_inc);
687 nextmap.apply_incremental(new_pending_inc);
688 ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
689 }
690
691 {
692 // https://tracker.ceph.com/issues/37493
693 pg_t ec_pg(0, my_ec_pool);
694 pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
695 OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
696 int from = -1;
697 int to = -1;
698 {
699 // insert a valid pg_upmap_item
700 vector<int> ec_up;
701 int ec_up_primary;
702 osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
703 ASSERT_TRUE(!ec_up.empty());
704 from = *(ec_up.begin());
705 ASSERT_TRUE(from >= 0);
706 for (int i = 0; i < (int)get_num_osds(); i++) {
707 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
708 to = i;
709 break;
710 }
711 }
712 ASSERT_TRUE(to >= 0);
713 ASSERT_TRUE(from != to);
714 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
715 new_pg_upmap_items.push_back(make_pair(from, to));
716 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
717 pending_inc.new_pg_upmap_items[ec_pgid] =
718 mempool::osdmap::vector<pair<int32_t,int32_t>>(
719 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
720 tmpmap.deepish_copy_from(osdmap);
721 tmpmap.apply_incremental(pending_inc);
722 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
723 }
724 {
725 // mark one of the target OSDs of the above pg_upmap_item as down
726 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
727 pending_inc.new_state[to] = CEPH_OSD_UP;
728 tmpmap.apply_incremental(pending_inc);
729 ASSERT_TRUE(!tmpmap.is_up(to));
730 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
731 }
732 {
733 // confirm *clean_pg_upmaps* won't do anything bad
734 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
735 clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
736 tmpmap.apply_incremental(pending_inc);
737 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
738 }
739 }
740
741 {
742 // http://tracker.ceph.com/issues/37501
743 pg_t ec_pg(0, my_ec_pool);
744 pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
745 OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
746 int from = -1;
747 int to = -1;
748 {
749 // insert a valid pg_upmap_item
750 vector<int> ec_up;
751 int ec_up_primary;
752 osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
753 ASSERT_TRUE(!ec_up.empty());
754 from = *(ec_up.begin());
755 ASSERT_TRUE(from >= 0);
756 for (int i = 0; i < (int)get_num_osds(); i++) {
757 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
758 to = i;
759 break;
760 }
761 }
762 ASSERT_TRUE(to >= 0);
763 ASSERT_TRUE(from != to);
764 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
765 new_pg_upmap_items.push_back(make_pair(from, to));
766 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
767 pending_inc.new_pg_upmap_items[ec_pgid] =
768 mempool::osdmap::vector<pair<int32_t,int32_t>>(
769 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
770 tmpmap.deepish_copy_from(osdmap);
771 tmpmap.apply_incremental(pending_inc);
772 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
773 }
774 {
775 // mark one of the target OSDs of the above pg_upmap_item as out
776 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
777 pending_inc.new_weight[to] = CEPH_OSD_OUT;
778 tmpmap.apply_incremental(pending_inc);
779 ASSERT_TRUE(tmpmap.is_out(to));
780 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
781 }
782 {
783 // *clean_pg_upmaps* should be able to remove the above *bad* mapping
784 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
785 clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
786 tmpmap.apply_incremental(pending_inc);
787 ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
788 }
789 }
790
791 {
792 // http://tracker.ceph.com/issues/37968
793
794 // build a temporary crush topology of 2 hosts, 3 osds per host
795 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
796 tmp.deepish_copy_from(osdmap);
797 const int expected_host_num = 2;
798 int osd_per_host = get_num_osds() / expected_host_num;
799 ASSERT_GE(osd_per_host, 3);
800 int index = 0;
801 for (int i = 0; i < (int)get_num_osds(); i++) {
802 if (i && i % osd_per_host == 0) {
803 ++index;
804 }
805 stringstream osd_name;
806 stringstream host_name;
807 vector<string> move_to;
808 osd_name << "osd." << i;
809 host_name << "host-" << index;
810 move_to.push_back("root=default");
811 string host_loc = "host=" + host_name.str();
812 move_to.push_back(host_loc);
813 auto r = crush_move(tmp, osd_name.str(), move_to);
814 ASSERT_EQ(0, r);
815 }
816
817 // build crush rule
818 CrushWrapper crush;
819 get_crush(tmp, crush);
820 string rule_name = "rule_37968";
821 int rule_type = pg_pool_t::TYPE_ERASURE;
822 ASSERT_TRUE(!crush.rule_exists(rule_name));
823 int rno;
824 for (rno = 0; rno < crush.get_max_rules(); rno++) {
825 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
826 break;
827 }
828 string root_name = "default";
829 int root = crush.get_item_id(root_name);
830 int min_size = 3;
831 int max_size = 4;
832 int steps = 6;
833 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
834 int step = 0;
835 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
836 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
837 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
838 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host*/);
839 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */);
840 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
841 ASSERT_TRUE(step == steps);
842 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
843 ASSERT_TRUE(r >= 0);
844 crush.set_rule_name(rno, rule_name);
845 {
846 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
847 pending_inc.crush.clear();
848 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
849 tmp.apply_incremental(pending_inc);
850 }
851
852 // create a erasuce-coded pool referencing the above rule
853 int64_t pool_37968;
854 {
855 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
856 new_pool_inc.new_pool_max = tmp.get_pool_max();
857 new_pool_inc.fsid = tmp.get_fsid();
858 pg_pool_t empty;
859 pool_37968 = ++new_pool_inc.new_pool_max;
860 pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty);
861 p->size = 4;
862 p->set_pg_num(8);
863 p->set_pgp_num(8);
864 p->type = pg_pool_t::TYPE_ERASURE;
865 p->crush_rule = rno;
866 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
867 new_pool_inc.new_pool_names[pool_37968] = "pool_37968";
868 tmp.apply_incremental(new_pool_inc);
869 }
870
871 pg_t ec_pg(0, pool_37968);
872 pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg);
873 int from = -1;
874 int to = -1;
875 {
876 // insert a valid pg_upmap_item
877 vector<int> ec_up;
878 int ec_up_primary;
879 tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
880 ASSERT_TRUE(ec_up.size() == 4);
881 from = *(ec_up.begin());
882 ASSERT_TRUE(from >= 0);
883 auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno);
884 ASSERT_TRUE(parent < 0);
885 // pick an osd of the same parent with *from*
886 for (int i = 0; i < (int)get_num_osds(); i++) {
887 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
888 auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
889 if (p == parent) {
890 to = i;
891 break;
892 }
893 }
894 }
895 ASSERT_TRUE(to >= 0);
896 ASSERT_TRUE(from != to);
897 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
898 new_pg_upmap_items.push_back(make_pair(from, to));
899 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
900 pending_inc.new_pg_upmap_items[ec_pgid] =
901 mempool::osdmap::vector<pair<int32_t,int32_t>>(
902 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
903 tmp.apply_incremental(pending_inc);
904 ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
905 }
906 {
907 // *clean_pg_upmaps* should not remove the above upmap_item
908 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
909 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
910 tmp.apply_incremental(pending_inc);
911 ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
912 }
913 }
914
915 {
916 // TEST pg_upmap
917 {
918 // STEP-1: enumerate all children of up[0]'s parent,
919 // replace up[1] with one of them (other than up[0])
920 int parent = osdmap.crush->get_parent_of_type(up[0],
921 osdmap.crush->get_type_id("host"));
922 set<int> candidates;
923 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
924 ASSERT_LT(1U, candidates.size());
925 int replaced_by = -1;
926 for (auto c: candidates) {
927 if (c != up[0]) {
928 replaced_by = c;
929 break;
930 }
931 }
932 {
933 // Check we can handle a negative pg_upmap value
934 vector<int32_t> new_pg_upmap;
935 new_pg_upmap.push_back(up[0]);
936 new_pg_upmap.push_back(-823648512);
937 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
938 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
939 new_pg_upmap.begin(), new_pg_upmap.end());
940 osdmap.apply_incremental(pending_inc);
941 vector<int> new_up;
942 int new_up_primary;
943 // crucial call - _apply_upmap should ignore the negative value
944 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
945 }
946 ASSERT_NE(-1, replaced_by);
947 // generate a new pg_upmap item and apply
948 vector<int32_t> new_pg_upmap;
949 new_pg_upmap.push_back(up[0]);
950 new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
951 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
952 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
953 new_pg_upmap.begin(), new_pg_upmap.end());
954 osdmap.apply_incremental(pending_inc);
955 {
956 // validate pg_upmap is there
957 vector<int> new_up;
958 int new_up_primary;
959 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
960 ASSERT_TRUE(up.size() == new_up.size());
961 ASSERT_TRUE(new_up[0] == new_pg_upmap[0]);
962 ASSERT_TRUE(new_up[1] == new_pg_upmap[1]);
963 // and we shall have two OSDs from a same host now..
964 int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
965 osdmap.crush->get_type_id("host"));
966 int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
967 osdmap.crush->get_type_id("host"));
968 ASSERT_TRUE(parent_0 == parent_1);
969 }
970 }
971 {
972 // STEP-2: apply cure
973 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
974 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
975 osdmap.apply_incremental(pending_inc);
976 {
977 // validate pg_upmap is gone (reverted)
978 vector<int> new_up;
979 int new_up_primary;
980 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
981 ASSERT_TRUE(new_up == up);
982 ASSERT_TRUE(new_up_primary = up_primary);
983 }
984 }
985 }
986
987 {
988 // TEST pg_upmap_items
989 // enumerate all used hosts first
990 set<int> parents;
991 for (auto u: up) {
992 int parent = osdmap.crush->get_parent_of_type(u,
993 osdmap.crush->get_type_id("host"));
994 ASSERT_GT(0, parent);
995 parents.insert(parent);
996 }
997 int candidate_parent = 0;
998 set<int> candidate_children;
999 vector<int> up_after_out;
1000 {
1001 // STEP-1: try mark out up[1] and all other OSDs from the same host
1002 int parent = osdmap.crush->get_parent_of_type(up[1],
1003 osdmap.crush->get_type_id("host"));
1004 set<int> children;
1005 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
1006 &children);
1007 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1008 for (auto c: children) {
1009 pending_inc.new_weight[c] = CEPH_OSD_OUT;
1010 }
1011 OSDMap tmpmap;
1012 tmpmap.deepish_copy_from(osdmap);
1013 tmpmap.apply_incremental(pending_inc);
1014 vector<int> new_up;
1015 int new_up_primary;
1016 tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1017 // verify that we'll have OSDs from a different host..
1018 int will_choose = -1;
1019 for (auto o: new_up) {
1020 int parent = tmpmap.crush->get_parent_of_type(o,
1021 osdmap.crush->get_type_id("host"));
1022 if (!parents.count(parent)) {
1023 will_choose = o;
1024 candidate_parent = parent; // record
1025 break;
1026 }
1027 }
1028 ASSERT_LT(-1, will_choose); // it is an OSD!
1029 ASSERT_TRUE(candidate_parent != 0);
1030 osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
1031 &candidate_children);
1032 ASSERT_TRUE(candidate_children.count(will_choose));
1033 candidate_children.erase(will_choose);
1034 ASSERT_TRUE(!candidate_children.empty());
1035 up_after_out = new_up; // needed for verification..
1036 }
1037 {
1038 // Make sure we can handle a negative pg_upmap_item
1039 int victim = up[0];
1040 int replaced_by = -823648512;
1041 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1042 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1043 // apply
1044 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1045 pending_inc.new_pg_upmap_items[pgid] =
1046 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1047 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1048 osdmap.apply_incremental(pending_inc);
1049 vector<int> new_up;
1050 int new_up_primary;
1051 // crucial call - _apply_upmap should ignore the negative value
1052 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1053 }
1054 {
1055 // STEP-2: generating a new pg_upmap_items entry by
1056 // replacing up[0] with one coming from candidate_children
1057 int victim = up[0];
1058 int replaced_by = *candidate_children.begin();
1059 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1060 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1061 // apply
1062 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1063 pending_inc.new_pg_upmap_items[pgid] =
1064 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1065 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1066 osdmap.apply_incremental(pending_inc);
1067 {
1068 // validate pg_upmap_items is there
1069 vector<int> new_up;
1070 int new_up_primary;
1071 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1072 ASSERT_TRUE(up.size() == new_up.size());
1073 ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
1074 new_up.end());
1075 // and up[1] too
1076 ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
1077 new_up.end());
1078 }
1079 }
1080 {
1081 // STEP-3: mark out up[1] and all other OSDs from the same host
1082 int parent = osdmap.crush->get_parent_of_type(up[1],
1083 osdmap.crush->get_type_id("host"));
1084 set<int> children;
1085 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
1086 &children);
1087 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1088 for (auto c: children) {
1089 pending_inc.new_weight[c] = CEPH_OSD_OUT;
1090 }
1091 osdmap.apply_incremental(pending_inc);
1092 {
1093 // validate we have two OSDs from the same host now..
1094 vector<int> new_up;
1095 int new_up_primary;
1096 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1097 ASSERT_TRUE(up.size() == new_up.size());
1098 int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
1099 osdmap.crush->get_type_id("host"));
1100 int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
1101 osdmap.crush->get_type_id("host"));
1102 ASSERT_TRUE(parent_0 == parent_1);
1103 }
1104 }
1105 {
1106 // STEP-4: apply cure
1107 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1108 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1109 osdmap.apply_incremental(pending_inc);
1110 {
1111 // validate pg_upmap_items is gone (reverted)
1112 vector<int> new_up;
1113 int new_up_primary;
1114 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1115 ASSERT_TRUE(new_up == up_after_out);
1116 }
1117 }
1118 }
1119 }
1120
TEST_F(OSDMapTest, BUG_38897) {
  // http://tracker.ceph.com/issues/38897
  // Scenario: pool1's PGs are pg_upmap-ed onto osd.1 only, so osd.0
  // reads as *underfull* and osd.1 as *overfull*; pool2 (which shares
  // osd.0) carries an unrelated pg_upmap_items entry remapping out of
  // osd.0.  calc_pg_upmaps() is then run restricted to pool1 only.
  // NOTE(review): presumably the original bug made the balancer trip
  // over pool2's entry when optimizing pool1 -- confirm against the
  // tracker ticket above.
  // build a fresh map with 12 OSDs, without any default pools
  set_up_map(12, true);
  const string pool_1("pool1");
  const string pool_2("pool2");
  int64_t pool_1_id = -1;

  {
    // build customized crush rule for "pool1"
    string host_name = "host_for_pool_1";
    // build a customized host to capture osd.1~4
    // (only osd.1..osd.4 are moved; osd.0 stays put so the rule below
    // can TAKE it explicitly)
    for (int i = 1; i < 5; i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0); // crush bucket ids are negative
    string rule_name = "rule_for_pool1";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    // pick the first unused rule id
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    int min_size = 3;
    int max_size = 3;
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      // install the modified crush map
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool1"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pool_1_id = pool_id;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->min_size = 1;
    p->set_pg_num(3);
    p->set_pgp_num(3);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_1;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
    {
      for (unsigned i = 0; i < 3; i++) {
        // 1.x -> [1]
        pg_t rawpg(i, pool_id);
        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
        vector<int> up;
        int up_primary;
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 3);
        ASSERT_TRUE(up[0] == 0); // the rule always picks osd.0 first

        // insert a new pg_upmap
        vector<int32_t> new_up;
        // and remap 1.x to osd.1 only
        // this way osd.0 is deemed to be *underfull*
        // and osd.1 is deemed to be *overfull*
        new_up.push_back(1);
        {
          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
            new_up.begin(), new_up.end());
          osdmap.apply_incremental(pending_inc);
        }
        // verify the upmap took effect: the PG now maps to osd.1 alone
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 1);
        ASSERT_TRUE(up[0] == 1);
      }
    }
  }

  {
    // build customized crush rule for "pool2"
    string host_name = "host_for_pool_2";
    // build a customized host to capture osd.6~11
    for (int i = 6; i < (int)get_num_osds(); i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0); // crush bucket ids are negative
    string rule_name = "rule_for_pool2";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    // pick the first unused rule id
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
        break;
    }
    int min_size = 3;
    int max_size = 3;
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      // install the modified crush map
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool2"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    // include a single PG
    p->set_pg_num(1);
    p->set_pgp_num(1);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_2;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
    pg_t rawpg(0, pool_id);
    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
    vector<int> up;
    int up_primary;
    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
    ASSERT_TRUE(up.size() == 3);
    ASSERT_TRUE(up[0] == 0); // this rule also picks osd.0 first

    {
      // build a pg_upmap_item that will
      // remap pg out from *underfull* osd.0
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      osdmap.apply_incremental(pending_inc);
      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
      vector<int> up;
      int up_primary;
      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
      ASSERT_TRUE(up.size() == 3);
      ASSERT_TRUE(up[0] == 10); // the item replaced osd.0 with osd.10
    }
  }

  // ready to go
  {
    // require perfect distribution!
    auto ret = g_ceph_context->_conf.set_val(
      "osd_calc_pg_upmaps_max_stddev", "0");
    ASSERT_EQ(0, ret);
    g_ceph_context->_conf.apply_changes(nullptr);
    set<int64_t> only_pools;
    ASSERT_TRUE(pool_1_id >= 0);
    // restrict optimization to pool1; pool2's upmap entry must not
    // confuse the balancer (the essence of issue 38897)
    only_pools.insert(pool_1_id);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    osdmap.calc_pg_upmaps(g_ceph_context,
                          0, // so we can force optimizing
                          100,
                          only_pools,
                          &pending_inc);
    osdmap.apply_incremental(pending_inc);
  }
}
1343
TEST_F(OSDMapTest, BUG_40104) {
  // http://tracker.ceph.com/issues/40104
  // Scalability check: build a very large map (5000 OSDs, 10000 PGs),
  // attach one pg_upmap_items entry to every PG, then time a single
  // clean_pg_upmaps() pass.  The tracker issue was a latency regression,
  // so the test prints the measured wall time (no hard assert on it).
  int big_osd_num = 5000;
  int big_pg_num = 10000;
  set_up_map(big_osd_num, true);
  int pool_id;
  {
    // create one big replicated pool ("big_pool") holding big_pg_num PGs
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    pool_id = ++pending_inc.new_pool_max;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->min_size = 1;
    p->set_pg_num(big_pg_num);
    p->set_pgp_num(big_pg_num);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = 0;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = "big_pool";
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == "big_pool");
  }
  {
    // generate pg_upmap_items for each pg
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    for (int i = 0; i < big_pg_num; i++) {
      pg_t rawpg(i, pool_id);
      pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
      vector<int> up;
      int up_primary;
      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
      ASSERT_TRUE(up.size() == 3);
      int victim = up[0];
      // remap the first up OSD to a randomly chosen one
      int replaced_by = random() % big_osd_num;
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      // note that it might or might not be valid, we don't care
      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
    }
    osdmap.apply_incremental(pending_inc);
  }
  {
    // time one full clean_pg_upmaps() sweep over all of the entries
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    auto start = mono_clock::now();
    clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
    auto latency = mono_clock::now() - start;
    std::cout << "clean_pg_upmaps (~" << big_pg_num
              << " pg_upmap_items) latency:" << timespan_str(latency)
              << std::endl;
  }
}
1399
1400 TEST_F(OSDMapTest, BUG_42052) {
1401 // https://tracker.ceph.com/issues/42052
1402 set_up_map(6, true);
1403 const string pool_name("pool");
1404 // build customized crush rule for "pool"
1405 CrushWrapper crush;
1406 get_crush(osdmap, crush);
1407 string rule_name = "rule";
1408 int rule_type = pg_pool_t::TYPE_REPLICATED;
1409 ASSERT_TRUE(!crush.rule_exists(rule_name));
1410 int rno;
1411 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1412 if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
1413 break;
1414 }
1415 int min_size = 3;
1416 int max_size = 3;
1417 int steps = 8;
1418 crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
1419 int step = 0;
1420 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1421 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1422 // always choose osd.0, osd.1, osd.2
1423 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
1424 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1425 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 1);
1426 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1427 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 2);
1428 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1429 ASSERT_TRUE(step == steps);
1430 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1431 ASSERT_TRUE(r >= 0);
1432 crush.set_rule_name(rno, rule_name);
1433 {
1434 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1435 pending_inc.crush.clear();
1436 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1437 osdmap.apply_incremental(pending_inc);
1438 }
1439
1440 // create "pool"
1441 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1442 pending_inc.new_pool_max = osdmap.get_pool_max();
1443 auto pool_id = ++pending_inc.new_pool_max;
1444 pg_pool_t empty;
1445 auto p = pending_inc.get_new_pool(pool_id, &empty);
1446 p->size = 3;
1447 p->min_size = 1;
1448 p->set_pg_num(1);
1449 p->set_pgp_num(1);
1450 p->type = pg_pool_t::TYPE_REPLICATED;
1451 p->crush_rule = rno;
1452 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1453 pending_inc.new_pool_names[pool_id] = pool_name;
1454 osdmap.apply_incremental(pending_inc);
1455 ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
1456 ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_name);
1457 pg_t rawpg(0, pool_id);
1458 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
1459 {
1460 // pg_upmap 1.0 [2,3,5]
1461 vector<int32_t> new_up{2,3,5};
1462 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1463 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
1464 new_up.begin(), new_up.end());
1465 osdmap.apply_incremental(pending_inc);
1466 }
1467 {
1468 // pg_upmap_items 1.0 [0,3,4,5]
1469 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1470 new_pg_upmap_items.push_back(make_pair(0, 3));
1471 new_pg_upmap_items.push_back(make_pair(4, 5));
1472 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1473 pending_inc.new_pg_upmap_items[pgid] =
1474 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1475 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1476 osdmap.apply_incremental(pending_inc);
1477 }
1478 {
1479 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1480 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1481 osdmap.apply_incremental(pending_inc);
1482 ASSERT_FALSE(osdmap.have_pg_upmaps(pgid));
1483 }
1484 }
1485
1486 TEST(PGTempMap, basic)
1487 {
1488 PGTempMap m;
1489 pg_t a(1,1);
1490 for (auto i=3; i<1000; ++i) {
1491 pg_t x(i, 1);
1492 m.set(x, {static_cast<int>(i)});
1493 }
1494 pg_t b(2,1);
1495 m.set(a, {1, 2});
1496 ASSERT_NE(m.find(a), m.end());
1497 ASSERT_EQ(m.find(a), m.begin());
1498 ASSERT_EQ(m.find(b), m.end());
1499 ASSERT_EQ(998u, m.size());
1500 }
1501