// Source file: ceph/src/test/osd/TestOSDMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
#include "gtest/gtest.h"
#include "osd/OSDMap.h"
#include "osd/OSDMapMapping.h"
#include "mon/OSDMonitor.h"
#include "mon/PGMap.h"

#include "global/global_context.h"
#include "global/global_init.h"
#include "common/common_init.h"
#include "common/ceph_argparse.h"
#include "common/ceph_json.h"

#include <algorithm>
#include <iostream>
#include <cmath>
17 using namespace std;
18
19 int main(int argc, char **argv) {
20 map<string,string> defaults = {
21 // make sure we have 3 copies, or some tests won't work
22 { "osd_pool_default_size", "3" },
23 // our map is flat, so just try and split across OSDs, not hosts or whatever
24 { "osd_crush_chooseleaf_type", "0" },
25 };
26 std::vector<const char*> args(argv, argv+argc);
27 auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
28 CODE_ENVIRONMENT_UTILITY,
29 CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
30 common_init_finish(g_ceph_context);
31 ::testing::InitGoogleTest(&argc, argv);
32 return RUN_ALL_TESTS();
33 }
34
35 class OSDMapTest : public testing::Test,
36 public ::testing::WithParamInterface<std::pair<int, int>> {
37 int num_osds = 6;
38 public:
39 OSDMap osdmap;
40 OSDMapMapping mapping;
41 const uint64_t my_ec_pool = 1;
42 const uint64_t my_rep_pool = 2;
43
44 // Blacklist testing lists
45 // I pulled the first two ranges and their start/end points from
46 // https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing#CIDR_notation
47 static const string range_addrs[];
48 static const string ip_addrs[];
49 static const string unblocked_ip_addrs[];
50 const string EC_RULE_NAME = "erasure";
51
52 OSDMapTest() {}
53
54 void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
55 num_osds = new_num_osds;
56 uuid_d fsid;
57 osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
58 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
59 pending_inc.fsid = osdmap.get_fsid();
60 entity_addrvec_t sample_addrs;
61 sample_addrs.v.push_back(entity_addr_t());
62 uuid_d sample_uuid;
63 for (int i = 0; i < num_osds; ++i) {
64 sample_uuid.generate_random();
65 sample_addrs.v[0].nonce = i;
66 pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW;
67 pending_inc.new_up_client[i] = sample_addrs;
68 pending_inc.new_up_cluster[i] = sample_addrs;
69 pending_inc.new_hb_back_up[i] = sample_addrs;
70 pending_inc.new_hb_front_up[i] = sample_addrs;
71 pending_inc.new_weight[i] = CEPH_OSD_IN;
72 pending_inc.new_uuid[i] = sample_uuid;
73 }
74 osdmap.apply_incremental(pending_inc);
75 if (no_default_pools) // do not create any default pool(s)
76 return;
77
78 OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
79 new_pool_inc.new_pool_max = osdmap.get_pool_max();
80 new_pool_inc.fsid = osdmap.get_fsid();
81 // make an ec pool
82 set_ec_pool("ec", new_pool_inc);
83 // and a replicated pool
84 set_rep_pool("reppool",new_pool_inc);
85 osdmap.apply_incremental(new_pool_inc);
86 }
87 int get_ec_crush_rule() {
88 int r = osdmap.crush->get_rule_id(EC_RULE_NAME);
89 if (r < 0) {
90 r = osdmap.crush->add_simple_rule(
91 EC_RULE_NAME, "default", "osd", "",
92 "indep", pg_pool_t::TYPE_ERASURE,
93 &cerr);
94 }
95 return r;
96 }
97 uint64_t set_ec_pool(const string &name, OSDMap::Incremental &new_pool_inc,
98 bool assert_pool_id = true) {
99 pg_pool_t empty;
100 uint64_t pool_id = ++new_pool_inc.new_pool_max;
101 if (assert_pool_id)
102 ceph_assert(pool_id == my_ec_pool);
103 pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
104 p->size = 3;
105 p->set_pg_num(64);
106 p->set_pgp_num(64);
107 p->type = pg_pool_t::TYPE_ERASURE;
108 p->crush_rule = get_ec_crush_rule();
109 new_pool_inc.new_pool_names[pool_id] = name;//"ec";
110 return pool_id;
111 }
112 uint64_t set_rep_pool(const string name, OSDMap::Incremental &new_pool_inc,
113 bool assert_pool_id = true) {
114 pg_pool_t empty;
115 uint64_t pool_id = ++new_pool_inc.new_pool_max;
116 if (assert_pool_id)
117 ceph_assert(pool_id == my_rep_pool);
118 pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
119 p->size = 3;
120 p->set_pg_num(64);
121 p->set_pgp_num(64);
122 p->type = pg_pool_t::TYPE_REPLICATED;
123 p->crush_rule = 0;
124 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
125 new_pool_inc.new_pool_names[pool_id] = name;//"reppool";
126 return pool_id;
127 }
128
129 unsigned int get_num_osds() { return num_osds; }
130 void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) {
131 bufferlist bl;
132 tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
133 auto p = bl.cbegin();
134 newcrush.decode(p);
135 }
136 int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) {
137 map<string,string> loc;
138 CrushWrapper::parse_loc_map(argvec, &loc);
139 CrushWrapper newcrush;
140 get_crush(tmap, newcrush);
141 if (!newcrush.name_exists(name)) {
142 return -ENOENT;
143 }
144 int id = newcrush.get_item_id(name);
145 int err;
146 if (!newcrush.check_item_loc(g_ceph_context, id, loc, (int *)NULL)) {
147 if (id >= 0) {
148 err = newcrush.create_or_move_item(g_ceph_context, id, 0, name, loc);
149 } else {
150 err = newcrush.move_bucket(g_ceph_context, id, loc);
151 }
152 if (err >= 0) {
153 OSDMap::Incremental pending_inc(tmap.get_epoch() + 1);
154 pending_inc.crush.clear();
155 newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
156 tmap.apply_incremental(pending_inc);
157 err = 0;
158 }
159 } else {
160 // already there
161 err = 0;
162 }
163 return err;
164 }
165 int crush_rule_create_replicated(const string &name,
166 const string &root,
167 const string &type) {
168 if (osdmap.crush->rule_exists(name)) {
169 return osdmap.crush->get_rule_id(name);
170 }
171 CrushWrapper newcrush;
172 get_crush(osdmap, newcrush);
173 string device_class;
174 stringstream ss;
175 int ruleno = newcrush.add_simple_rule(
176 name, root, type, device_class,
177 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
178 if (ruleno >= 0) {
179 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
180 pending_inc.crush.clear();
181 newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
182 osdmap.apply_incremental(pending_inc);
183 }
184 return ruleno;
185 }
186 void test_mappings(int pool,
187 int num,
188 vector<int> *any,
189 vector<int> *first,
190 vector<int> *primary) {
191 mapping.update(osdmap);
192 for (int i=0; i<num; ++i) {
193 vector<int> up, acting;
194 int up_primary, acting_primary;
195 pg_t pgid(i, pool);
196 osdmap.pg_to_up_acting_osds(pgid,
197 &up, &up_primary, &acting, &acting_primary);
198 for (unsigned j=0; j<acting.size(); ++j)
199 (*any)[acting[j]]++;
200 if (!acting.empty())
201 (*first)[acting[0]]++;
202 if (acting_primary >= 0)
203 (*primary)[acting_primary]++;
204
205 // compare to precalc mapping
206 vector<int> up2, acting2;
207 int up_primary2, acting_primary2;
208 pgid = osdmap.raw_pg_to_pg(pgid);
209 mapping.get(pgid, &up2, &up_primary2, &acting2, &acting_primary2);
210 ASSERT_EQ(up, up2);
211 ASSERT_EQ(up_primary, up_primary2);
212 ASSERT_EQ(acting, acting2);
213 ASSERT_EQ(acting_primary, acting_primary2);
214 }
215 cout << "any: " << *any << std::endl;;
216 cout << "first: " << *first << std::endl;;
217 cout << "primary: " << *primary << std::endl;;
218 }
219 void clean_pg_upmaps(CephContext *cct,
220 const OSDMap& om,
221 OSDMap::Incremental& pending_inc) {
222 int cpu_num = 8;
223 int pgs_per_chunk = 256;
224 ThreadPool tp(cct, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num);
225 tp.start();
226 ParallelPGMapper mapper(cct, &tp);
227 vector<pg_t> pgs_to_check;
228 om.get_upmap_pgs(&pgs_to_check);
229 OSDMonitor::CleanUpmapJob job(cct, om, pending_inc);
230 mapper.queue(&job, pgs_per_chunk, pgs_to_check);
231 job.wait();
232 tp.stop();
233 }
234 void set_primary_affinity_all(float pa) {
235 for (uint i = 0 ; i < get_num_osds() ; i++) {
236 osdmap.set_primary_affinity(i, int(pa * CEPH_OSD_MAX_PRIMARY_AFFINITY));
237 }
238 }
239 bool score_in_range(float score, uint nosds = 0) {
240 if (nosds == 0) {
241 nosds = get_num_osds();
242 }
243 return score >= 1.0 && score <= float(nosds);
244 }
245 };
246
247 TEST_F(OSDMapTest, Create) {
248 set_up_map();
249 ASSERT_EQ(get_num_osds(), (unsigned)osdmap.get_max_osd());
250 ASSERT_EQ(get_num_osds(), osdmap.get_num_in_osds());
251 }
252
253 TEST_F(OSDMapTest, Features) {
254 // with EC pool
255 set_up_map();
256 uint64_t features = osdmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
257 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
258 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
259 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
260 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
261 ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
262 ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
263
264 // clients have a slightly different view
265 features = osdmap.get_features(CEPH_ENTITY_TYPE_CLIENT, NULL);
266 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
267 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
268 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
269 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
270 ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
271 ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
272
273 // remove teh EC pool, but leave the rule. add primary affinity.
274 {
275 OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
276 new_pool_inc.old_pools.insert(osdmap.lookup_pg_pool_name("ec"));
277 new_pool_inc.new_primary_affinity[0] = 0x8000;
278 osdmap.apply_incremental(new_pool_inc);
279 }
280
281 features = osdmap.get_features(CEPH_ENTITY_TYPE_MON, NULL);
282 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES);
283 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
284 ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
285 ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
286 ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
287 ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
288
289 // FIXME: test tiering feature bits
290 }
291
292 TEST_F(OSDMapTest, MapPG) {
293 set_up_map();
294
295 std::cerr << " osdmap.pool_max==" << osdmap.get_pool_max() << std::endl;
296 pg_t rawpg(0, my_rep_pool);
297 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
298 vector<int> up_osds, acting_osds;
299 int up_primary, acting_primary;
300
301 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
302 &acting_osds, &acting_primary);
303
304 vector<int> old_up_osds, old_acting_osds;
305 osdmap.pg_to_up_acting_osds(pgid, old_up_osds, old_acting_osds);
306 ASSERT_EQ(old_up_osds, up_osds);
307 ASSERT_EQ(old_acting_osds, acting_osds);
308
309 ASSERT_EQ(osdmap.get_pg_pool(my_rep_pool)->get_size(), up_osds.size());
310 }
311
312 TEST_F(OSDMapTest, MapFunctionsMatch) {
313 // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match
314 set_up_map();
315 pg_t rawpg(0, my_rep_pool);
316 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
317 vector<int> up_osds, acting_osds;
318 int up_primary, acting_primary;
319
320 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
321 &acting_osds, &acting_primary);
322
323 vector<int> up_osds_two, acting_osds_two;
324
325 osdmap.pg_to_up_acting_osds(pgid, up_osds_two, acting_osds_two);
326
327 ASSERT_EQ(up_osds, up_osds_two);
328 ASSERT_EQ(acting_osds, acting_osds_two);
329
330 int acting_primary_two;
331 osdmap.pg_to_acting_osds(pgid, &acting_osds_two, &acting_primary_two);
332 EXPECT_EQ(acting_osds, acting_osds_two);
333 EXPECT_EQ(acting_primary, acting_primary_two);
334 osdmap.pg_to_acting_osds(pgid, acting_osds_two);
335 EXPECT_EQ(acting_osds, acting_osds_two);
336 }
337
338 /** This test must be removed or modified appropriately when we allow
339 * other ways to specify a primary. */
340 TEST_F(OSDMapTest, PrimaryIsFirst) {
341 set_up_map();
342
343 pg_t rawpg(0, my_rep_pool);
344 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
345 vector<int> up_osds, acting_osds;
346 int up_primary, acting_primary;
347
348 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
349 &acting_osds, &acting_primary);
350 EXPECT_EQ(up_osds[0], up_primary);
351 EXPECT_EQ(acting_osds[0], acting_primary);
352 }
353
354 TEST_F(OSDMapTest, PGTempRespected) {
355 set_up_map();
356
357 pg_t rawpg(0, my_rep_pool);
358 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
359 vector<int> up_osds, acting_osds;
360 int up_primary, acting_primary;
361
362 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
363 &acting_osds, &acting_primary);
364
365 // copy and swap first and last element in acting_osds
366 vector<int> new_acting_osds(acting_osds);
367 int first = new_acting_osds[0];
368 new_acting_osds[0] = *new_acting_osds.rbegin();
369 *new_acting_osds.rbegin() = first;
370
371 // apply pg_temp to osdmap
372 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
373 pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
374 new_acting_osds.begin(), new_acting_osds.end());
375 osdmap.apply_incremental(pgtemp_map);
376
377 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
378 &acting_osds, &acting_primary);
379 EXPECT_EQ(new_acting_osds, acting_osds);
380 }
381
382 TEST_F(OSDMapTest, PrimaryTempRespected) {
383 set_up_map();
384
385 pg_t rawpg(0, my_rep_pool);
386 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
387 vector<int> up_osds;
388 vector<int> acting_osds;
389 int up_primary, acting_primary;
390
391 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
392 &acting_osds, &acting_primary);
393
394 // make second OSD primary via incremental
395 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
396 pgtemp_map.new_primary_temp[pgid] = acting_osds[1];
397 osdmap.apply_incremental(pgtemp_map);
398
399 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
400 &acting_osds, &acting_primary);
401 EXPECT_EQ(acting_primary, acting_osds[1]);
402 }
403
// Verify OSDMap::clean_temps():
//  - a pg_temp/primary_temp already applied to the map that merely
//    duplicates the raw mapping must be explicitly cancelled in the
//    pending incremental (empty pg_temp vector / primary_temp -1);
//  - a redundant pg_temp/primary_temp that exists only in the pending
//    incremental must simply be dropped from it.
TEST_F(OSDMapTest, CleanTemps) {
  set_up_map();

  // pgtemp_map is applied to osdmap; pending_inc stays pending (next epoch)
  OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
  OSDMap::Incremental pending_inc(osdmap.get_epoch() + 2);
  pg_t pga = osdmap.raw_pg_to_pg(pg_t(0, my_rep_pool));
  {
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pga, &up_osds, &up_primary,
                                &acting_osds, &acting_primary);
    // temp mapping identical to the raw "up" mapping: a useless temp
    pgtemp_map.new_pg_temp[pga] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pgtemp_map.new_primary_temp[pga] = up_primary;
  }
  pg_t pgb = osdmap.raw_pg_to_pg(pg_t(1, my_rep_pool));
  {
    vector<int> up_osds, acting_osds;
    int up_primary, acting_primary;
    osdmap.pg_to_up_acting_osds(pgb, &up_osds, &up_primary,
                                &acting_osds, &acting_primary);
    // same kind of useless temp, but only in the pending incremental
    pending_inc.new_pg_temp[pgb] = mempool::osdmap::vector<int>(
      up_osds.begin(), up_osds.end());
    pending_inc.new_primary_temp[pgb] = up_primary;
  }

  osdmap.apply_incremental(pgtemp_map);

  // next-epoch view of the map, with the pending incremental applied
  OSDMap tmpmap;
  tmpmap.deepish_copy_from(osdmap);
  tmpmap.apply_incremental(pending_inc);
  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);

  // pga's applied temps are cancelled: empty pg_temp and primary -1
  EXPECT_TRUE(pending_inc.new_pg_temp.count(pga) &&
              pending_inc.new_pg_temp[pga].size() == 0);
  EXPECT_EQ(-1, pending_inc.new_primary_temp[pga]);

  // pgb's never-applied temps are simply removed from the incremental
  EXPECT_TRUE(!pending_inc.new_pg_temp.count(pgb) &&
              !pending_inc.new_primary_temp.count(pgb));
}
444
445 TEST_F(OSDMapTest, KeepsNecessaryTemps) {
446 set_up_map();
447
448 pg_t rawpg(0, my_rep_pool);
449 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
450 vector<int> up_osds, acting_osds;
451 int up_primary, acting_primary;
452
453 osdmap.pg_to_up_acting_osds(pgid, &up_osds, &up_primary,
454 &acting_osds, &acting_primary);
455
456 // find unused OSD and stick it in there
457 OSDMap::Incremental pgtemp_map(osdmap.get_epoch() + 1);
458 // find an unused osd and put it in place of the first one
459 int i = 0;
460 for(; i != (int)get_num_osds(); ++i) {
461 bool in_use = false;
462 for (vector<int>::iterator osd_it = up_osds.begin();
463 osd_it != up_osds.end();
464 ++osd_it) {
465 if (i == *osd_it) {
466 in_use = true;
467 break;
468 }
469 }
470 if (!in_use) {
471 up_osds[1] = i;
472 break;
473 }
474 }
475 if (i == (int)get_num_osds())
476 FAIL() << "did not find unused OSD for temp mapping";
477
478 pgtemp_map.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
479 up_osds.begin(), up_osds.end());
480 pgtemp_map.new_primary_temp[pgid] = up_osds[1];
481 osdmap.apply_incremental(pgtemp_map);
482
483 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
484
485 OSDMap tmpmap;
486 tmpmap.deepish_copy_from(osdmap);
487 tmpmap.apply_incremental(pending_inc);
488 OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
489 EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid));
490 EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid));
491 }
492
// Exercise primary affinity over 10000 PGs per pool:
//  - equal affinity: every OSD appears, leads, and is primary sometimes;
//  - affinity 0: the OSD is never primary (and, for replicated pools,
//    never first in the acting set);
//  - affinity 0.5 (0x8000): the OSD gets roughly half its fair share of
//    primaries (checked within a +/-33% band).
TEST_F(OSDMapTest, PrimaryAffinity) {
  set_up_map();

  int n = get_num_osds();
  for (map<int64_t,pg_pool_t>::const_iterator p = osdmap.get_pools().begin();
       p != osdmap.get_pools().end();
       ++p) {
    int pool = p->first;
    // fair share of primaries per OSD out of 10000 samples
    int expect_primary = 10000 / n;
    cout << "pool " << pool << " size " << (int)p->second.size
         << " expect_primary " << expect_primary << std::endl;
    {
      // baseline: every OSD appears, leads, and is primary at least once
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        ASSERT_LT(0, first[i]);
        ASSERT_LT(0, primary[i]);
      }
    }

    // zero affinity on osd.0 and osd.1: neither may ever be primary
    osdmap.set_primary_affinity(0, 0);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      for (int i=0; i<n; ++i) {
        // still serve data (appear in acting sets) ...
        ASSERT_LT(0, any[i]);
        if (i >= 2) {
          ASSERT_LT(0, first[i]);
          ASSERT_LT(0, primary[i]);
        } else {
          // ... but never lead; EC pools don't reorder, so only check
          // "first" for replicated pools
          if (p->second.is_replicated()) {
            ASSERT_EQ(0, first[i]);
          }
          ASSERT_EQ(0, primary[i]);
        }
      }
    }

    // osd.0 at half affinity (0x8000 of 0x10000), osd.1 still zero
    osdmap.set_primary_affinity(0, 0x8000);
    osdmap.set_primary_affinity(1, 0);
    {
      vector<int> any(n, 0);
      vector<int> first(n, 0);
      vector<int> primary(n, 0);
      test_mappings(pool, 10000, &any, &first, &primary);
      int expect = (10000 / (n-2)) / 2; // half weight
      cout << "expect " << expect << std::endl;
      for (int i=0; i<n; ++i) {
        ASSERT_LT(0, any[i]);
        if (i >= 2) {
          ASSERT_LT(0, first[i]);
          ASSERT_LT(0, primary[i]);
        } else if (i == 1) {
          if (p->second.is_replicated()) {
            ASSERT_EQ(0, first[i]);
          }
          ASSERT_EQ(0, primary[i]);
        } else {
          // osd.0: within 2/3 .. 4/3 of its half-share
          ASSERT_LT(expect *2/3, primary[0]);
          ASSERT_GT(expect *4/3, primary[0]);
        }
      }
    }

    // restore full affinity before testing the next pool
    osdmap.set_primary_affinity(0, 0x10000);
    osdmap.set_primary_affinity(1, 0x10000);
  }
}
567
// Flags set on a CRUSH ancestor node propagate to the OSDs beneath it;
// flags set on the root (-1) therefore apply to every OSD, and setting
// the value back to 0 clears them.
TEST_F(OSDMapTest, get_osd_crush_node_flags) {
  set_up_map();

  // no flags anywhere yet
  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
  }

  // set flags on the root; every OSD inherits them
  OSDMap::Incremental inc(osdmap.get_epoch() + 1);
  inc.new_crush_node_flags[-1] = 123u;
  osdmap.apply_incremental(inc);
  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(123u, osdmap.get_osd_crush_node_flags(i));
  }
  // a nonexistent OSD id reports no flags
  ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));

  // overwrite the root flags with a new value
  OSDMap::Incremental inc3(osdmap.get_epoch() + 1);
  inc3.new_crush_node_flags[-1] = 456u;
  osdmap.apply_incremental(inc3);
  for (unsigned i=0; i<get_num_osds(); ++i) {
    ASSERT_EQ(456u, osdmap.get_osd_crush_node_flags(i));
  }
  ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));

  // setting the root flags to 0 clears them again
  OSDMap::Incremental inc2(osdmap.get_epoch() + 1);
  inc2.new_crush_node_flags[-1] = 0;
  osdmap.apply_incremental(inc2);
  for (unsigned i=0; i<get_num_osds(); ++i) {
    // NOTE(review): this loop calls get_crush_node_flags(i) (keyed by
    // crush node id) while every loop above uses
    // get_osd_crush_node_flags(i); both should be 0 here, but confirm
    // the asymmetry is intentional and not a typo.
    ASSERT_EQ(0u, osdmap.get_crush_node_flags(i));
  }
}
598
599 TEST_F(OSDMapTest, parse_osd_id_list) {
600 set_up_map();
601 set<int> out;
602 set<int> all;
603 osdmap.get_all_osds(all);
604
605 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0"}, &out, &cout));
606 ASSERT_EQ(1u, out.size());
607 ASSERT_EQ(0, *out.begin());
608
609 ASSERT_EQ(0, osdmap.parse_osd_id_list({"1"}, &out, &cout));
610 ASSERT_EQ(1u, out.size());
611 ASSERT_EQ(1, *out.begin());
612
613 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","osd.1"}, &out, &cout));
614 ASSERT_EQ(2u, out.size());
615 ASSERT_EQ(0, *out.begin());
616 ASSERT_EQ(1, *out.rbegin());
617
618 ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","1"}, &out, &cout));
619 ASSERT_EQ(2u, out.size());
620 ASSERT_EQ(0, *out.begin());
621 ASSERT_EQ(1, *out.rbegin());
622
623 ASSERT_EQ(0, osdmap.parse_osd_id_list({"*"}, &out, &cout));
624 ASSERT_EQ(all.size(), out.size());
625 ASSERT_EQ(all, out);
626
627 ASSERT_EQ(0, osdmap.parse_osd_id_list({"all"}, &out, &cout));
628 ASSERT_EQ(all, out);
629
630 ASSERT_EQ(0, osdmap.parse_osd_id_list({"any"}, &out, &cout));
631 ASSERT_EQ(all, out);
632
633 ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"foo"}, &out, &cout));
634 ASSERT_EQ(-EINVAL, osdmap.parse_osd_id_list({"-12"}, &out, &cout));
635 }
636
637 TEST_F(OSDMapTest, CleanPGUpmaps) {
638 set_up_map();
639
640 // build a crush rule of type host
641 const int expected_host_num = 3;
642 int osd_per_host = get_num_osds() / expected_host_num;
643 ASSERT_GE(2, osd_per_host);
644 int index = 0;
645 for (int i = 0; i < (int)get_num_osds(); i++) {
646 if (i && i % osd_per_host == 0) {
647 ++index;
648 }
649 stringstream osd_name;
650 stringstream host_name;
651 vector<string> move_to;
652 osd_name << "osd." << i;
653 host_name << "host-" << index;
654 move_to.push_back("root=default");
655 string host_loc = "host=" + host_name.str();
656 move_to.push_back(host_loc);
657 int r = crush_move(osdmap, osd_name.str(), move_to);
658 ASSERT_EQ(0, r);
659 }
660 const string upmap_rule = "upmap";
661 int upmap_rule_no = crush_rule_create_replicated(
662 upmap_rule, "default", "host");
663 ASSERT_LT(0, upmap_rule_no);
664
665 // create a replicated pool which references the above rule
666 OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
667 new_pool_inc.new_pool_max = osdmap.get_pool_max();
668 new_pool_inc.fsid = osdmap.get_fsid();
669 pg_pool_t empty;
670 uint64_t upmap_pool_id = ++new_pool_inc.new_pool_max;
671 pg_pool_t *p = new_pool_inc.get_new_pool(upmap_pool_id, &empty);
672 p->size = 2;
673 p->set_pg_num(64);
674 p->set_pgp_num(64);
675 p->type = pg_pool_t::TYPE_REPLICATED;
676 p->crush_rule = upmap_rule_no;
677 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
678 new_pool_inc.new_pool_names[upmap_pool_id] = "upmap_pool";
679 osdmap.apply_incremental(new_pool_inc);
680
681 pg_t rawpg(0, upmap_pool_id);
682 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
683 vector<int> up;
684 int up_primary;
685 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
686 ASSERT_LT(1U, up.size());
687 {
688 // validate we won't have two OSDs from a same host
689 int parent_0 = osdmap.crush->get_parent_of_type(up[0],
690 osdmap.crush->get_type_id("host"));
691 int parent_1 = osdmap.crush->get_parent_of_type(up[1],
692 osdmap.crush->get_type_id("host"));
693 ASSERT_TRUE(parent_0 != parent_1);
694 }
695
696 {
697 // cancel stale upmaps
698 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
699 int from = -1;
700 for (int i = 0; i < (int)get_num_osds(); i++) {
701 if (std::find(up.begin(), up.end(), i) == up.end()) {
702 from = i;
703 break;
704 }
705 }
706 ASSERT_TRUE(from >= 0);
707 int to = -1;
708 for (int i = 0; i < (int)get_num_osds(); i++) {
709 if (std::find(up.begin(), up.end(), i) == up.end() && i != from) {
710 to = i;
711 break;
712 }
713 }
714 ASSERT_TRUE(to >= 0);
715 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
716 new_pg_upmap_items.push_back(make_pair(from, to));
717 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
718 pending_inc.new_pg_upmap_items[pgid] =
719 mempool::osdmap::vector<pair<int32_t,int32_t>>(
720 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
721 OSDMap nextmap;
722 nextmap.deepish_copy_from(osdmap);
723 nextmap.apply_incremental(pending_inc);
724 ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
725 OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
726 clean_pg_upmaps(g_ceph_context, nextmap, new_pending_inc);
727 nextmap.apply_incremental(new_pending_inc);
728 ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
729 }
730
731 {
732 // https://tracker.ceph.com/issues/37493
733 pg_t ec_pg(0, my_ec_pool);
734 pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
735 OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
736 int from = -1;
737 int to = -1;
738 {
739 // insert a valid pg_upmap_item
740 vector<int> ec_up;
741 int ec_up_primary;
742 osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
743 ASSERT_TRUE(!ec_up.empty());
744 from = *(ec_up.begin());
745 ASSERT_TRUE(from >= 0);
746 for (int i = 0; i < (int)get_num_osds(); i++) {
747 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
748 to = i;
749 break;
750 }
751 }
752 ASSERT_TRUE(to >= 0);
753 ASSERT_TRUE(from != to);
754 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
755 new_pg_upmap_items.push_back(make_pair(from, to));
756 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
757 pending_inc.new_pg_upmap_items[ec_pgid] =
758 mempool::osdmap::vector<pair<int32_t,int32_t>>(
759 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
760 tmpmap.deepish_copy_from(osdmap);
761 tmpmap.apply_incremental(pending_inc);
762 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
763 }
764 {
765 // mark one of the target OSDs of the above pg_upmap_item as down
766 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
767 pending_inc.new_state[to] = CEPH_OSD_UP;
768 tmpmap.apply_incremental(pending_inc);
769 ASSERT_TRUE(!tmpmap.is_up(to));
770 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
771 }
772 {
773 // confirm *clean_pg_upmaps* won't do anything bad
774 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
775 clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
776 tmpmap.apply_incremental(pending_inc);
777 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
778 }
779 }
780
781 {
782 // http://tracker.ceph.com/issues/37501
783 pg_t ec_pg(0, my_ec_pool);
784 pg_t ec_pgid = osdmap.raw_pg_to_pg(ec_pg);
785 OSDMap tmpmap; // use a tmpmap here, so we do not dirty origin map..
786 int from = -1;
787 int to = -1;
788 {
789 // insert a valid pg_upmap_item
790 vector<int> ec_up;
791 int ec_up_primary;
792 osdmap.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
793 ASSERT_TRUE(!ec_up.empty());
794 from = *(ec_up.begin());
795 ASSERT_TRUE(from >= 0);
796 for (int i = 0; i < (int)get_num_osds(); i++) {
797 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
798 to = i;
799 break;
800 }
801 }
802 ASSERT_TRUE(to >= 0);
803 ASSERT_TRUE(from != to);
804 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
805 new_pg_upmap_items.push_back(make_pair(from, to));
806 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
807 pending_inc.new_pg_upmap_items[ec_pgid] =
808 mempool::osdmap::vector<pair<int32_t,int32_t>>(
809 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
810 tmpmap.deepish_copy_from(osdmap);
811 tmpmap.apply_incremental(pending_inc);
812 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
813 }
814 {
815 // mark one of the target OSDs of the above pg_upmap_item as out
816 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
817 pending_inc.new_weight[to] = CEPH_OSD_OUT;
818 tmpmap.apply_incremental(pending_inc);
819 ASSERT_TRUE(tmpmap.is_out(to));
820 ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
821 }
822 {
823 // *clean_pg_upmaps* should be able to remove the above *bad* mapping
824 OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
825 clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
826 tmpmap.apply_incremental(pending_inc);
827 ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
828 }
829 }
830
831 {
832 // http://tracker.ceph.com/issues/37968
833
834 // build a temporary crush topology of 2 hosts, 3 osds per host
835 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
836 tmp.deepish_copy_from(osdmap);
837 const int expected_host_num = 2;
838 int osd_per_host = get_num_osds() / expected_host_num;
839 ASSERT_GE(osd_per_host, 3);
840 int index = 0;
841 for (int i = 0; i < (int)get_num_osds(); i++) {
842 if (i && i % osd_per_host == 0) {
843 ++index;
844 }
845 stringstream osd_name;
846 stringstream host_name;
847 vector<string> move_to;
848 osd_name << "osd." << i;
849 host_name << "host-" << index;
850 move_to.push_back("root=default");
851 string host_loc = "host=" + host_name.str();
852 move_to.push_back(host_loc);
853 auto r = crush_move(tmp, osd_name.str(), move_to);
854 ASSERT_EQ(0, r);
855 }
856
857 // build crush rule
858 CrushWrapper crush;
859 get_crush(tmp, crush);
860 string rule_name = "rule_37968";
861 int rule_type = pg_pool_t::TYPE_ERASURE;
862 ASSERT_TRUE(!crush.rule_exists(rule_name));
863 int rno;
864 for (rno = 0; rno < crush.get_max_rules(); rno++) {
865 if (!crush.rule_exists(rno))
866 break;
867 }
868 string root_name = "default";
869 int root = crush.get_item_id(root_name);
870 int steps = 6;
871 crush_rule *rule = crush_make_rule(steps, rule_type);
872 int step = 0;
873 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
874 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
875 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
876 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host*/);
877 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */);
878 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
879 ASSERT_TRUE(step == steps);
880 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
881 ASSERT_TRUE(r >= 0);
882 crush.set_rule_name(rno, rule_name);
883 {
884 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
885 pending_inc.crush.clear();
886 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
887 tmp.apply_incremental(pending_inc);
888 }
889
890 // create a erasuce-coded pool referencing the above rule
891 int64_t pool_37968;
892 {
893 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
894 new_pool_inc.new_pool_max = tmp.get_pool_max();
895 new_pool_inc.fsid = tmp.get_fsid();
896 pg_pool_t empty;
897 pool_37968 = ++new_pool_inc.new_pool_max;
898 pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty);
899 p->size = 4;
900 p->set_pg_num(8);
901 p->set_pgp_num(8);
902 p->type = pg_pool_t::TYPE_ERASURE;
903 p->crush_rule = rno;
904 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
905 new_pool_inc.new_pool_names[pool_37968] = "pool_37968";
906 tmp.apply_incremental(new_pool_inc);
907 }
908
909 pg_t ec_pg(0, pool_37968);
910 pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg);
911 int from = -1;
912 int to = -1;
913 {
914 // insert a valid pg_upmap_item
915 vector<int> ec_up;
916 int ec_up_primary;
917 tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
918 ASSERT_TRUE(ec_up.size() == 4);
919 from = *(ec_up.begin());
920 ASSERT_TRUE(from >= 0);
921 auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno);
922 ASSERT_TRUE(parent < 0);
923 // pick an osd of the same parent with *from*
924 for (int i = 0; i < (int)get_num_osds(); i++) {
925 if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
926 auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
927 if (p == parent) {
928 to = i;
929 break;
930 }
931 }
932 }
933 ASSERT_TRUE(to >= 0);
934 ASSERT_TRUE(from != to);
935 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
936 new_pg_upmap_items.push_back(make_pair(from, to));
937 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
938 pending_inc.new_pg_upmap_items[ec_pgid] =
939 mempool::osdmap::vector<pair<int32_t,int32_t>>(
940 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
941 tmp.apply_incremental(pending_inc);
942 ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
943 }
944 {
945 // *clean_pg_upmaps* should not remove the above upmap_item
946 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
947 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
948 tmp.apply_incremental(pending_inc);
949 ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
950 }
951 }
952
953 {
954 // TEST pg_upmap
955 {
956 // STEP-1: enumerate all children of up[0]'s parent,
957 // replace up[1] with one of them (other than up[0])
958 int parent = osdmap.crush->get_parent_of_type(up[0],
959 osdmap.crush->get_type_id("host"));
960 set<int> candidates;
961 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent), &candidates);
962 ASSERT_LT(1U, candidates.size());
963 int replaced_by = -1;
964 for (auto c: candidates) {
965 if (c != up[0]) {
966 replaced_by = c;
967 break;
968 }
969 }
970 {
971 // Check we can handle a negative pg_upmap value
972 vector<int32_t> new_pg_upmap;
973 new_pg_upmap.push_back(up[0]);
974 new_pg_upmap.push_back(-823648512);
975 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
976 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
977 new_pg_upmap.begin(), new_pg_upmap.end());
978 osdmap.apply_incremental(pending_inc);
979 vector<int> new_up;
980 int new_up_primary;
981 // crucial call - _apply_upmap should ignore the negative value
982 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
983 }
984 ASSERT_NE(-1, replaced_by);
985 // generate a new pg_upmap item and apply
986 vector<int32_t> new_pg_upmap;
987 new_pg_upmap.push_back(up[0]);
988 new_pg_upmap.push_back(replaced_by); // up[1] -> replaced_by
989 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
990 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
991 new_pg_upmap.begin(), new_pg_upmap.end());
992 osdmap.apply_incremental(pending_inc);
993 {
994 // validate pg_upmap is there
995 vector<int> new_up;
996 int new_up_primary;
997 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
998 ASSERT_EQ(new_up.size(), up.size());
999 ASSERT_EQ(new_up[0], new_pg_upmap[0]);
1000 ASSERT_EQ(new_up[1], new_pg_upmap[1]);
1001 // and we shall have two OSDs from a same host now..
1002 int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
1003 osdmap.crush->get_type_id("host"));
1004 int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
1005 osdmap.crush->get_type_id("host"));
1006 ASSERT_EQ(parent_0, parent_1);
1007 }
1008 }
1009 {
1010 // STEP-2: apply cure
1011 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1012 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1013 osdmap.apply_incremental(pending_inc);
1014 {
1015 // validate pg_upmap is gone (reverted)
1016 vector<int> new_up;
1017 int new_up_primary;
1018 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1019 ASSERT_EQ(new_up, up);
1020 ASSERT_EQ(new_up_primary, up_primary);
1021 }
1022 }
1023 }
1024
1025 {
1026 // TEST pg_upmap_items
1027 // enumerate all used hosts first
1028 set<int> parents;
1029 for (auto u: up) {
1030 int parent = osdmap.crush->get_parent_of_type(u,
1031 osdmap.crush->get_type_id("host"));
1032 ASSERT_GT(0, parent);
1033 parents.insert(parent);
1034 }
1035 int candidate_parent = 0;
1036 set<int> candidate_children;
1037 vector<int> up_after_out;
1038 {
1039 // STEP-1: try mark out up[1] and all other OSDs from the same host
1040 int parent = osdmap.crush->get_parent_of_type(up[1],
1041 osdmap.crush->get_type_id("host"));
1042 set<int> children;
1043 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
1044 &children);
1045 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1046 for (auto c: children) {
1047 pending_inc.new_weight[c] = CEPH_OSD_OUT;
1048 }
1049 OSDMap tmpmap;
1050 tmpmap.deepish_copy_from(osdmap);
1051 tmpmap.apply_incremental(pending_inc);
1052 vector<int> new_up;
1053 int new_up_primary;
1054 tmpmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1055 // verify that we'll have OSDs from a different host..
1056 int will_choose = -1;
1057 for (auto o: new_up) {
1058 int parent = tmpmap.crush->get_parent_of_type(o,
1059 osdmap.crush->get_type_id("host"));
1060 if (!parents.count(parent)) {
1061 will_choose = o;
1062 candidate_parent = parent; // record
1063 break;
1064 }
1065 }
1066 ASSERT_LT(-1, will_choose); // it is an OSD!
1067 ASSERT_NE(candidate_parent, 0);
1068 osdmap.crush->get_leaves(osdmap.crush->get_item_name(candidate_parent),
1069 &candidate_children);
1070 ASSERT_TRUE(candidate_children.count(will_choose));
1071 candidate_children.erase(will_choose);
1072 ASSERT_FALSE(candidate_children.empty());
1073 up_after_out = new_up; // needed for verification..
1074 }
1075 {
1076 // Make sure we can handle a negative pg_upmap_item
1077 int victim = up[0];
1078 int replaced_by = -823648512;
1079 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1080 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1081 // apply
1082 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1083 pending_inc.new_pg_upmap_items[pgid] =
1084 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1085 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1086 osdmap.apply_incremental(pending_inc);
1087 vector<int> new_up;
1088 int new_up_primary;
1089 // crucial call - _apply_upmap should ignore the negative value
1090 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1091 }
1092 {
1093 // STEP-2: generating a new pg_upmap_items entry by
1094 // replacing up[0] with one coming from candidate_children
1095 int victim = up[0];
1096 int replaced_by = *candidate_children.begin();
1097 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1098 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1099 // apply
1100 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1101 pending_inc.new_pg_upmap_items[pgid] =
1102 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1103 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1104 osdmap.apply_incremental(pending_inc);
1105 {
1106 // validate pg_upmap_items is there
1107 vector<int> new_up;
1108 int new_up_primary;
1109 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1110 ASSERT_EQ(new_up.size(), up.size());
1111 ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), replaced_by) !=
1112 new_up.end());
1113 // and up[1] too
1114 ASSERT_TRUE(std::find(new_up.begin(), new_up.end(), up[1]) !=
1115 new_up.end());
1116 }
1117 }
1118 {
1119 // STEP-3: mark out up[1] and all other OSDs from the same host
1120 int parent = osdmap.crush->get_parent_of_type(up[1],
1121 osdmap.crush->get_type_id("host"));
1122 set<int> children;
1123 osdmap.crush->get_leaves(osdmap.crush->get_item_name(parent),
1124 &children);
1125 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1126 for (auto c: children) {
1127 pending_inc.new_weight[c] = CEPH_OSD_OUT;
1128 }
1129 osdmap.apply_incremental(pending_inc);
1130 {
1131 // validate we have two OSDs from the same host now..
1132 vector<int> new_up;
1133 int new_up_primary;
1134 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1135 ASSERT_EQ(up.size(), new_up.size());
1136 int parent_0 = osdmap.crush->get_parent_of_type(new_up[0],
1137 osdmap.crush->get_type_id("host"));
1138 int parent_1 = osdmap.crush->get_parent_of_type(new_up[1],
1139 osdmap.crush->get_type_id("host"));
1140 ASSERT_EQ(parent_0, parent_1);
1141 }
1142 }
1143 {
1144 // STEP-4: apply cure
1145 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1146 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1147 osdmap.apply_incremental(pending_inc);
1148 {
1149 // validate pg_upmap_items is gone (reverted)
1150 vector<int> new_up;
1151 int new_up_primary;
1152 osdmap.pg_to_raw_up(pgid, &new_up, &new_up_primary);
1153 ASSERT_EQ(new_up, up_after_out);
1154 }
1155 }
1156 }
1157 }
1158
// Regression test for http://tracker.ceph.com/issues/38897:
// calc_pg_upmaps must be able to optimize a pool even when another pool's
// upmaps make some OSDs look artificially *underfull*/*overfull*.
// The test builds two pools on disjoint custom hosts, biases their
// placement with pg_upmap/pg_upmap_items, then runs calc_pg_upmaps
// restricted to pool1 only and applies the result.
TEST_F(OSDMapTest, BUG_38897) {
  // http://tracker.ceph.com/issues/38897
  // build a fresh map with 12 OSDs, without any default pools
  set_up_map(12, true);
  const string pool_1("pool1");
  const string pool_2("pool2");
  int64_t pool_1_id = -1;

  {
    // build customized crush rule for "pool1"
    string host_name = "host_for_pool_1";
    // build a customized host to capture osd.1~4
    // (NOTE: the loop range is [1, 5), i.e. four OSDs, not five)
    for (int i = 1; i < 5; i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0); // bucket ids are negative
    string rule_name = "rule_for_pool1";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    // find the first free rule id
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno))
        break;
    }
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rule_type);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      // push the modified crush map into the osdmap
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool1"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pool_1_id = pool_id;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    p->min_size = 1;
    p->set_pg_num(3);
    p->set_pgp_num(3);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_1;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
    {
      for (unsigned i = 0; i < 3; i++) {
        // 1.x -> [1]
        pg_t rawpg(i, pool_id);
        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
        vector<int> up;
        int up_primary;
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 3);
        // the rule's first TAKE/EMIT pins osd.0 as the primary
        ASSERT_TRUE(up[0] == 0);

        // insert a new pg_upmap
        vector<int32_t> new_up;
        // and remap 1.x to osd.1 only
        // this way osd.0 is deemed to be *underfull*
        // and osd.1 is deemed to be *overfull*
        new_up.push_back(1);
        {
          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
            new_up.begin(), new_up.end());
          osdmap.apply_incremental(pending_inc);
        }
        // the full upmap replaces the acting set wholesale
        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
        ASSERT_TRUE(up.size() == 1);
        ASSERT_TRUE(up[0] == 1);
      }
    }
  }

  {
    // build customized crush rule for "pool2"
    string host_name = "host_for_pool_2";
    // build a customized host to capture osd.6~11
    for (int i = 6; i < (int)get_num_osds(); i++) {
      stringstream osd_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      move_to.push_back("root=default");
      string host_loc = "host=" + host_name;
      move_to.push_back(host_loc);
      auto r = crush_move(osdmap, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }
    CrushWrapper crush;
    get_crush(osdmap, crush);
    auto host_id = crush.get_item_id(host_name);
    ASSERT_TRUE(host_id < 0); // bucket ids are negative
    string rule_name = "rule_for_pool2";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    // find the first free rule id
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno))
        break;
    }
    int steps = 7;
    crush_rule *rule = crush_make_rule(steps, rule_type);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    // always choose osd.0
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    // then pick any other random osds
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    {
      // push the modified crush map into the osdmap
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      osdmap.apply_incremental(pending_inc);
    }

    // create "pool2"
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    pending_inc.new_pool_max = osdmap.get_pool_max();
    auto pool_id = ++pending_inc.new_pool_max;
    pg_pool_t empty;
    auto p = pending_inc.get_new_pool(pool_id, &empty);
    p->size = 3;
    // include a single PG
    p->set_pg_num(1);
    p->set_pgp_num(1);
    p->type = pg_pool_t::TYPE_REPLICATED;
    p->crush_rule = rno;
    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
    pending_inc.new_pool_names[pool_id] = pool_2;
    osdmap.apply_incremental(pending_inc);
    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
    pg_t rawpg(0, pool_id);
    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
    vector<int> up;
    int up_primary;
    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
    ASSERT_TRUE(up.size() == 3);
    ASSERT_TRUE(up[0] == 0);

    {
      // build a pg_upmap_item that will
      // remap pg out from *underfull* osd.0
      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
      pending_inc.new_pg_upmap_items[pgid] =
        mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
      osdmap.apply_incremental(pending_inc);
      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
      vector<int> up;
      int up_primary;
      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
      ASSERT_TRUE(up.size() == 3);
      ASSERT_TRUE(up[0] == 10);
    }
  }

  // ready to go
  // the crucial part: calc_pg_upmaps restricted to pool1 must cope with
  // the skewed utilization created above without crashing or looping
  {
    set<int64_t> only_pools;
    ASSERT_TRUE(pool_1_id >= 0);
    only_pools.insert(pool_1_id);
    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
    // require perfect distribution! (max deviation 0)
    osdmap.calc_pg_upmaps(g_ceph_context,
                          0, // so we can force optimizing
                          100,
                          only_pools,
                          &pending_inc);
    osdmap.apply_incremental(pending_inc);
  }
}
1373
1374 TEST_F(OSDMapTest, BUG_40104) {
1375 // http://tracker.ceph.com/issues/40104
1376 int big_osd_num = 5000;
1377 int big_pg_num = 10000;
1378 set_up_map(big_osd_num, true);
1379 int pool_id;
1380 {
1381 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1382 pending_inc.new_pool_max = osdmap.get_pool_max();
1383 pool_id = ++pending_inc.new_pool_max;
1384 pg_pool_t empty;
1385 auto p = pending_inc.get_new_pool(pool_id, &empty);
1386 p->size = 3;
1387 p->min_size = 1;
1388 p->set_pg_num(big_pg_num);
1389 p->set_pgp_num(big_pg_num);
1390 p->type = pg_pool_t::TYPE_REPLICATED;
1391 p->crush_rule = 0;
1392 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1393 pending_inc.new_pool_names[pool_id] = "big_pool";
1394 osdmap.apply_incremental(pending_inc);
1395 ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
1396 ASSERT_TRUE(osdmap.get_pool_name(pool_id) == "big_pool");
1397 }
1398 {
1399 // generate pg_upmap_items for each pg
1400 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1401 for (int i = 0; i < big_pg_num; i++) {
1402 pg_t rawpg(i, pool_id);
1403 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
1404 vector<int> up;
1405 int up_primary;
1406 osdmap.pg_to_raw_up(pgid, &up, &up_primary);
1407 ASSERT_TRUE(up.size() == 3);
1408 int victim = up[0];
1409 int replaced_by = random() % big_osd_num;
1410 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1411 // note that it might or might not be valid, we don't care
1412 new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
1413 pending_inc.new_pg_upmap_items[pgid] =
1414 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1415 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1416 }
1417 osdmap.apply_incremental(pending_inc);
1418 }
1419 {
1420 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1421 auto start = mono_clock::now();
1422 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1423 auto latency = mono_clock::now() - start;
1424 std::cout << "clean_pg_upmaps (~" << big_pg_num
1425 << " pg_upmap_items) latency:" << timespan_str(latency)
1426 << std::endl;
1427 }
1428 }
1429
1430 TEST_F(OSDMapTest, BUG_42052) {
1431 // https://tracker.ceph.com/issues/42052
1432 set_up_map(6, true);
1433 const string pool_name("pool");
1434 // build customized crush rule for "pool"
1435 CrushWrapper crush;
1436 get_crush(osdmap, crush);
1437 string rule_name = "rule";
1438 int rule_type = pg_pool_t::TYPE_REPLICATED;
1439 ASSERT_TRUE(!crush.rule_exists(rule_name));
1440 int rno;
1441 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1442 if (!crush.rule_exists(rno))
1443 break;
1444 }
1445 int steps = 8;
1446 crush_rule *rule = crush_make_rule(steps, rule_type);
1447 int step = 0;
1448 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1449 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1450 // always choose osd.0, osd.1, osd.2
1451 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
1452 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1453 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 1);
1454 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1455 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 2);
1456 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1457 ASSERT_TRUE(step == steps);
1458 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1459 ASSERT_TRUE(r >= 0);
1460 crush.set_rule_name(rno, rule_name);
1461 {
1462 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1463 pending_inc.crush.clear();
1464 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1465 osdmap.apply_incremental(pending_inc);
1466 }
1467
1468 // create "pool"
1469 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1470 pending_inc.new_pool_max = osdmap.get_pool_max();
1471 auto pool_id = ++pending_inc.new_pool_max;
1472 pg_pool_t empty;
1473 auto p = pending_inc.get_new_pool(pool_id, &empty);
1474 p->size = 3;
1475 p->min_size = 1;
1476 p->set_pg_num(1);
1477 p->set_pgp_num(1);
1478 p->type = pg_pool_t::TYPE_REPLICATED;
1479 p->crush_rule = rno;
1480 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1481 pending_inc.new_pool_names[pool_id] = pool_name;
1482 osdmap.apply_incremental(pending_inc);
1483 ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
1484 ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_name);
1485 pg_t rawpg(0, pool_id);
1486 pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
1487 {
1488 // pg_upmap 1.0 [2,3,5]
1489 vector<int32_t> new_up{2,3,5};
1490 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1491 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
1492 new_up.begin(), new_up.end());
1493 osdmap.apply_incremental(pending_inc);
1494 }
1495 {
1496 // pg_upmap_items 1.0 [0,3,4,5]
1497 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1498 new_pg_upmap_items.push_back(make_pair(0, 3));
1499 new_pg_upmap_items.push_back(make_pair(4, 5));
1500 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1501 pending_inc.new_pg_upmap_items[pgid] =
1502 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1503 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1504 osdmap.apply_incremental(pending_inc);
1505 }
1506 {
1507 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1508 clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
1509 osdmap.apply_incremental(pending_inc);
1510 ASSERT_FALSE(osdmap.have_pg_upmaps(pgid));
1511 }
1512 }
1513
1514 TEST_F(OSDMapTest, BUG_42485) {
1515 set_up_map(60);
1516 {
1517 // build a temporary crush topology of 2datacenters, 3racks per dc,
1518 // 1host per rack, 10osds per host
1519 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
1520 tmp.deepish_copy_from(osdmap);
1521 const int expected_host_num = 6;
1522 int osd_per_host = (int)get_num_osds() / expected_host_num;
1523 ASSERT_GE(osd_per_host, 10);
1524 int host_per_dc = 3;
1525 int index = 0;
1526 int dc_index = 0;
1527 for (int i = 0; i < (int)get_num_osds(); i++) {
1528 if (i && i % osd_per_host == 0) {
1529 ++index;
1530 }
1531 if (i && i % (host_per_dc * osd_per_host) == 0) {
1532 ++dc_index;
1533 }
1534 stringstream osd_name;
1535 stringstream host_name;
1536 stringstream rack_name;
1537 stringstream dc_name;
1538 vector<string> move_to;
1539 osd_name << "osd." << i;
1540 host_name << "host-" << index;
1541 rack_name << "rack-" << index;
1542 dc_name << "dc-" << dc_index;
1543 move_to.push_back("root=default");
1544 string dc_loc = "datacenter=" + dc_name.str();
1545 move_to.push_back(dc_loc);
1546 string rack_loc = "rack=" + rack_name.str();
1547 move_to.push_back(rack_loc);
1548 string host_loc = "host=" + host_name.str();
1549 move_to.push_back(host_loc);
1550 auto r = crush_move(tmp, osd_name.str(), move_to);
1551 ASSERT_EQ(0, r);
1552 }
1553
1554 // build crush rule
1555 CrushWrapper crush;
1556 get_crush(tmp, crush);
1557 string rule_name = "rule_xeus_993_1";
1558 int rule_type = pg_pool_t::TYPE_REPLICATED;
1559 ASSERT_TRUE(!crush.rule_exists(rule_name));
1560 int rno;
1561 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1562 if (!crush.rule_exists(rno))
1563 break;
1564 }
1565 string root_name = "default";
1566 string dc_1 = "dc-0";
1567 int dc1 = crush.get_item_id(dc_1);
1568 string dc_2 = "dc-1";
1569 int dc2 = crush.get_item_id(dc_2);
1570 int steps = 8;
1571 crush_rule *rule = crush_make_rule(steps, rule_type);
1572 int step = 0;
1573 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1574 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1575 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc1, 0);
1576 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */);
1577 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1578 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, dc2, 0);
1579 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 3 /* rack */);
1580 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1581 ASSERT_TRUE(step == steps);
1582 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1583 ASSERT_TRUE(r >= 0);
1584 crush.set_rule_name(rno, rule_name);
1585 {
1586 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1587 pending_inc.crush.clear();
1588 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1589 tmp.apply_incremental(pending_inc);
1590 }
1591 // create a repliacted pool referencing the above rule
1592 int64_t pool_xeus_993;
1593 {
1594 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
1595 new_pool_inc.new_pool_max = tmp.get_pool_max();
1596 new_pool_inc.fsid = tmp.get_fsid();
1597 pg_pool_t empty;
1598 pool_xeus_993 = ++new_pool_inc.new_pool_max;
1599 pg_pool_t *p = new_pool_inc.get_new_pool(pool_xeus_993, &empty);
1600 p->size = 4;
1601 p->set_pg_num(4096);
1602 p->set_pgp_num(4096);
1603 p->type = pg_pool_t::TYPE_REPLICATED;
1604 p->crush_rule = rno;
1605 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1606 new_pool_inc.new_pool_names[pool_xeus_993] = "pool_xeus_993";
1607 tmp.apply_incremental(new_pool_inc);
1608 }
1609
1610 pg_t rep_pg(0, pool_xeus_993);
1611 pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
1612 {
1613 int from = -1;
1614 int to = -1;
1615 vector<int> rep_up;
1616 int rep_up_primary;
1617 tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
1618 std::cout << "pgid " << rep_up << " up " << rep_up << std::endl;
1619 ASSERT_TRUE(rep_up.size() == 4);
1620 from = *(rep_up.begin());
1621 ASSERT_TRUE(from >= 0);
1622 auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno);
1623 if (dc_parent == dc1)
1624 dc_parent = dc2;
1625 else
1626 dc_parent = dc1;
1627 auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
1628 ASSERT_TRUE(dc_parent < 0);
1629 ASSERT_TRUE(rack_parent < 0);
1630 set<int> rack_parents;
1631 for (auto &i: rep_up) {
1632 if (i == from) continue;
1633 auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1634 rack_parents.insert(rack_parent);
1635 }
1636 for (int i = 0; i < (int)get_num_osds(); i++) {
1637 if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
1638 auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno);
1639 auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1640 if (dc_p == dc_parent &&
1641 rack_parents.find(rack_p) == rack_parents.end()) {
1642 to = i;
1643 break;
1644 }
1645 }
1646 }
1647 ASSERT_TRUE(to >= 0);
1648 ASSERT_TRUE(from != to);
1649 std::cout << "from " << from << " to " << to << std::endl;
1650 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1651 new_pg_upmap_items.push_back(make_pair(from, to));
1652 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1653 pending_inc.new_pg_upmap_items[rep_pgid] =
1654 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1655 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1656 tmp.apply_incremental(pending_inc);
1657 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1658 }
1659 pg_t rep_pg2(2, pool_xeus_993);
1660 pg_t rep_pgid2 = tmp.raw_pg_to_pg(rep_pg2);
1661 {
1662 pg_t rep_pgid = rep_pgid2;
1663 vector<int> from_osds{-1, -1};
1664 vector<int> rep_up;
1665 int rep_up_primary;
1666 tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
1667 ASSERT_TRUE(rep_up.size() == 4);
1668 from_osds[0] = *(rep_up.begin());
1669 from_osds[1] = *(rep_up.rbegin());
1670 std::cout << "pgid " << rep_pgid2 << " up " << rep_up << std::endl;
1671 ASSERT_TRUE(*(from_osds.begin()) >= 0);
1672 ASSERT_TRUE(*(from_osds.rbegin()) >= 0);
1673 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1674 for (auto &from: from_osds) {
1675 int to = -1;
1676 auto dc_parent = tmp.crush->get_parent_of_type(from, 8 /* dc */, rno);
1677 if (dc_parent == dc1)
1678 dc_parent = dc2;
1679 else
1680 dc_parent = dc1;
1681 auto rack_parent = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
1682 ASSERT_TRUE(dc_parent < 0);
1683 ASSERT_TRUE(rack_parent < 0);
1684 set<int> rack_parents;
1685 for (auto &i: rep_up) {
1686 if (i == from) continue;
1687 auto rack_parent = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1688 rack_parents.insert(rack_parent);
1689 }
1690 for (auto &i: new_pg_upmap_items) {
1691 auto rack_from = tmp.crush->get_parent_of_type(i.first, 3, rno);
1692 auto rack_to = tmp.crush->get_parent_of_type(i.second, 3, rno);
1693 rack_parents.insert(rack_from);
1694 rack_parents.insert(rack_to);
1695 }
1696 for (int i = 0; i < (int)get_num_osds(); i++) {
1697 if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
1698 auto dc_p = tmp.crush->get_parent_of_type(i, 8 /* dc */, rno);
1699 auto rack_p = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1700 if (dc_p == dc_parent &&
1701 rack_parents.find(rack_p) == rack_parents.end()) {
1702 to = i;
1703 break;
1704 }
1705 }
1706 }
1707 ASSERT_TRUE(to >= 0);
1708 ASSERT_TRUE(from != to);
1709 std::cout << "from " << from << " to " << to << std::endl;
1710 new_pg_upmap_items.push_back(make_pair(from, to));
1711 }
1712 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1713 pending_inc.new_pg_upmap_items[rep_pgid] =
1714 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1715 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1716 tmp.apply_incremental(pending_inc);
1717 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1718 }
1719 {
1720 // *maybe_remove_pg_upmaps* should remove the above upmap_item
1721 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1722 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
1723 tmp.apply_incremental(pending_inc);
1724 ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid));
1725 ASSERT_FALSE(tmp.have_pg_upmaps(rep_pgid2));
1726 }
1727 }
1728 }
1729
1730 TEST(PGTempMap, basic)
1731 {
1732 PGTempMap m;
1733 pg_t a(1,1);
1734 for (auto i=3; i<1000; ++i) {
1735 pg_t x(i, 1);
1736 m.set(x, {static_cast<int>(i)});
1737 }
1738 pg_t b(2,1);
1739 m.set(a, {1, 2});
1740 ASSERT_NE(m.find(a), m.end());
1741 ASSERT_EQ(m.find(a), m.begin());
1742 ASSERT_EQ(m.find(b), m.end());
1743 ASSERT_EQ(998u, m.size());
1744 }
1745
1746 TEST_F(OSDMapTest, BUG_43124) {
1747 set_up_map(200);
1748 {
1749 // https://tracker.ceph.com/issues/43124
1750
1751 // build a temporary crush topology of 5racks,
1752 // 4 hosts per rack, 10osds per host
1753 OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
1754 tmp.deepish_copy_from(osdmap);
1755 const int expected_host_num = 20;
1756 int osd_per_host = (int)get_num_osds() / expected_host_num;
1757 ASSERT_GE(osd_per_host, 10);
1758 int host_per_rack = 4;
1759 int index = 0;
1760 int rack_index = 0;
1761 for (int i = 0; i < (int)get_num_osds(); i++) {
1762 if (i && i % osd_per_host == 0) {
1763 ++index;
1764 }
1765 if (i && i % (host_per_rack * osd_per_host) == 0) {
1766 ++rack_index;
1767 }
1768 stringstream osd_name;
1769 stringstream host_name;
1770 stringstream rack_name;
1771 vector<string> move_to;
1772 osd_name << "osd." << i;
1773 host_name << "host-" << index;
1774 rack_name << "rack-" << rack_index;
1775 move_to.push_back("root=default");
1776 string rack_loc = "rack=" + rack_name.str();
1777 move_to.push_back(rack_loc);
1778 string host_loc = "host=" + host_name.str();
1779 move_to.push_back(host_loc);
1780 auto r = crush_move(tmp, osd_name.str(), move_to);
1781 ASSERT_EQ(0, r);
1782 }
1783
1784 // build crush rule
1785 CrushWrapper crush;
1786 get_crush(tmp, crush);
1787 string rule_name = "rule_angel_1944";
1788 int rule_type = pg_pool_t::TYPE_ERASURE;
1789 ASSERT_TRUE(!crush.rule_exists(rule_name));
1790 int rno;
1791 for (rno = 0; rno < crush.get_max_rules(); rno++) {
1792 if (!crush.rule_exists(rno))
1793 break;
1794 }
1795 int steps = 6;
1796 string root_name = "default";
1797 int root = crush.get_item_id(root_name);
1798 crush_rule *rule = crush_make_rule(steps, rule_type);
1799 int step = 0;
1800 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
1801 crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
1802 crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
1803 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_FIRSTN, 4, 3 /* rack */);
1804 crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_INDEP, 3, 1 /* host */);
1805 crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
1806 ASSERT_TRUE(step == steps);
1807 auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
1808 ASSERT_TRUE(r >= 0);
1809 crush.set_rule_name(rno, rule_name);
1810 {
1811 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1812 pending_inc.crush.clear();
1813 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1814 tmp.apply_incremental(pending_inc);
1815 }
1816 {
1817 stringstream oss;
1818 crush.dump_tree(&oss, NULL);
1819 std::cout << oss.str() << std::endl;
1820 Formatter *f = Formatter::create("json-pretty");
1821 f->open_object_section("crush_rules");
1822 crush.dump_rules(f);
1823 f->close_section();
1824 f->flush(cout);
1825 delete f;
1826 }
1827 // create an erasure-coded pool referencing the above rule
1828 int64_t pool_angel_1944;
1829 {
1830 OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
1831 new_pool_inc.new_pool_max = tmp.get_pool_max();
1832 new_pool_inc.fsid = tmp.get_fsid();
1833 pg_pool_t empty;
1834 pool_angel_1944 = ++new_pool_inc.new_pool_max;
1835 pg_pool_t *p = new_pool_inc.get_new_pool(pool_angel_1944, &empty);
1836 p->size = 12;
1837 p->set_pg_num(4096);
1838 p->set_pgp_num(4096);
1839 p->type = pg_pool_t::TYPE_ERASURE;
1840 p->crush_rule = rno;
1841 p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
1842 new_pool_inc.new_pool_names[pool_angel_1944] = "pool_angel_1944";
1843 tmp.apply_incremental(new_pool_inc);
1844 }
1845
1846 pg_t rep_pg(0, pool_angel_1944);
1847 pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
1848 {
1849 // insert a pg_upmap_item
1850 int from = -1;
1851 int to = -1;
1852 vector<int> rep_up;
1853 int rep_up_primary;
1854 tmp.pg_to_raw_up(rep_pgid, &rep_up, &rep_up_primary);
1855 std::cout << "pgid " << rep_pgid << " up " << rep_up << std::endl;
1856 ASSERT_TRUE(rep_up.size() == 12);
1857 from = *(rep_up.begin());
1858 ASSERT_TRUE(from >= 0);
1859 auto from_rack = tmp.crush->get_parent_of_type(from, 3 /* rack */, rno);
1860 set<int> failure_domains;
1861 for (auto &osd : rep_up) {
1862 failure_domains.insert(tmp.crush->get_parent_of_type(osd, 1 /* host */, rno));
1863 }
1864 for (int i = 0; i < (int)get_num_osds(); i++) {
1865 if (std::find(rep_up.begin(), rep_up.end(), i) == rep_up.end()) {
1866 auto to_rack = tmp.crush->get_parent_of_type(i, 3 /* rack */, rno);
1867 auto to_host = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
1868 if (to_rack != from_rack && failure_domains.count(to_host) == 0) {
1869 to = i;
1870 break;
1871 }
1872 }
1873 }
1874 ASSERT_TRUE(to >= 0);
1875 ASSERT_TRUE(from != to);
1876 std::cout << "from " << from << " to " << to << std::endl;
1877 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
1878 new_pg_upmap_items.push_back(make_pair(from, to));
1879 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1880 pending_inc.new_pg_upmap_items[rep_pgid] =
1881 mempool::osdmap::vector<pair<int32_t,int32_t>>(
1882 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
1883 tmp.apply_incremental(pending_inc);
1884 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1885 }
1886 {
1887 // *maybe_remove_pg_upmaps* should not remove the above upmap_item
1888 OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
1889 clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
1890 tmp.apply_incremental(pending_inc);
1891 ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
1892 }
1893 }
1894 }
1895
1896 TEST_F(OSDMapTest, BUG_48884)
1897 {
1898
1899 set_up_map(12);
1900
1901 unsigned int host_index = 1;
1902 for (unsigned int x=0; x < get_num_osds();) {
1903 // Create three hosts with four osds each
1904 for (unsigned int y=0; y < 4; y++) {
1905 stringstream osd_name;
1906 stringstream host_name;
1907 vector<string> move_to;
1908 osd_name << "osd." << x;
1909 host_name << "host-" << host_index;
1910 move_to.push_back("root=default");
1911 move_to.push_back("rack=localrack");
1912 string host_loc = "host=" + host_name.str();
1913 move_to.push_back(host_loc);
1914 int r = crush_move(osdmap, osd_name.str(), move_to);
1915 ASSERT_EQ(0, r);
1916 x++;
1917 }
1918 host_index++;
1919 }
1920
1921 CrushWrapper crush;
1922 get_crush(osdmap, crush);
1923 auto host_id = crush.get_item_id("localhost");
1924 crush.remove_item(g_ceph_context, host_id, false);
1925 OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
1926 pending_inc.crush.clear();
1927 crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
1928 osdmap.apply_incremental(pending_inc);
1929
1930 PGMap pgmap;
1931 osd_stat_t stats, stats_null;
1932 stats.statfs.total = 500000;
1933 stats.statfs.available = 50000;
1934 stats.statfs.omap_allocated = 50000;
1935 stats.statfs.internal_metadata = 50000;
1936 stats_null.statfs.total = 0;
1937 stats_null.statfs.available = 0;
1938 stats_null.statfs.omap_allocated = 0;
1939 stats_null.statfs.internal_metadata = 0;
1940 for (unsigned int x=0; x < get_num_osds(); x++) {
1941 if (x > 3 && x < 8) {
1942 pgmap.osd_stat.insert({x,stats_null});
1943 } else {
1944 pgmap.osd_stat.insert({x,stats});
1945 }
1946 }
1947
1948 stringstream ss;
1949 boost::scoped_ptr<Formatter> f(Formatter::create("json-pretty"));
1950 print_osd_utilization(osdmap, pgmap, ss, f.get(), true, "root");
1951 JSONParser parser;
1952 parser.parse(ss.str().c_str(), static_cast<int>(ss.str().size()));
1953 auto iter = parser.find_first();
1954 for (const auto& bucket : (*iter)->get_array_elements()) {
1955 JSONParser parser2;
1956 parser2.parse(bucket.c_str(), static_cast<int>(bucket.size()));
1957 auto* obj = parser2.find_obj("name");
1958 if (obj->get_data().compare("localrack") == 0) {
1959 obj = parser2.find_obj("kb");
1960 ASSERT_EQ(obj->get_data(), "3904");
1961 obj = parser2.find_obj("kb_used");
1962 ASSERT_EQ(obj->get_data(), "3512");
1963 obj = parser2.find_obj("kb_used_omap");
1964 ASSERT_EQ(obj->get_data(), "384");
1965 obj = parser2.find_obj("kb_used_meta");
1966 ASSERT_EQ(obj->get_data(), "384");
1967 obj = parser2.find_obj("kb_avail");
1968 ASSERT_EQ(obj->get_data(), "384");
1969 }
1970 }
1971 }
1972
// Regression test for https://tracker.ceph.com/issues/51842:
// explicit pg_upmap entries must be cleaned up by clean_pg_upmaps()
// when a pool's size changes and the upmaps become invalid.
// Parameterized over (choose count, failure-domain type) for the
// crush "chooseleaf firstn" step.
TEST_P(OSDMapTest, BUG_51842) {
    set_up_map(3, true);
    OSDMap tmp; // use a tmpmap here, so we do not dirty origin map..
    tmp.deepish_copy_from(osdmap);
    // Place each OSD on its own host under a dedicated root bucket.
    for (int i = 0; i < (int)get_num_osds(); i++) {
      stringstream osd_name;
      stringstream host_name;
      vector<string> move_to;
      osd_name << "osd." << i;
      host_name << "host=host-" << i;
      move_to.push_back("root=infra-1706");
      move_to.push_back(host_name.str());
      auto r = crush_move(tmp, osd_name.str(), move_to);
      ASSERT_EQ(0, r);
    }

    // build crush rule
    CrushWrapper crush;
    get_crush(tmp, crush);
    string rule_name = "infra-1706";
    int rule_type = pg_pool_t::TYPE_REPLICATED;
    ASSERT_TRUE(!crush.rule_exists(rule_name));
    // Pick the first unused rule id.
    int rno;
    for (rno = 0; rno < crush.get_max_rules(); rno++) {
      if (!crush.rule_exists(rno))
        break;
    }
    string root_bucket = "infra-1706";
    int root = crush.get_item_id(root_bucket);
    int steps = 5;
    crush_rule *rule = crush_make_rule(steps, rule_type);
    int step = 0;
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
    // note: it's ok to set like 'step chooseleaf_firstn 0 host'
    // The choose count and failure domain come from the test parameter
    // (see the INSTANTIATE_TEST_SUITE_P list at the end of the file).
    std::pair<int, int> param = GetParam();
    int rep_num = std::get<0>(param);
    int domain = std::get<1>(param);
    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, rep_num, domain);
    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
    ASSERT_TRUE(step == steps);
    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
    ASSERT_TRUE(r >= 0);
    crush.set_rule_name(rno, rule_name);
    // Publish the new crush map through an incremental.
    {
      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
      pending_inc.crush.clear();
      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
      tmp.apply_incremental(pending_inc);
    }
    // Dump the tree and rules for debugging test failures.
    {
      stringstream oss;
      crush.dump_tree(&oss, NULL);
      std::cout << oss.str() << std::endl;
      Formatter *f = Formatter::create("json-pretty");
      f->open_object_section("crush_rules");
      crush.dump_rules(f);
      f->close_section();
      f->flush(cout);
      delete f;
    }
    // create a replicated pool referencing the above rule
    int64_t pool_infra_1706;
    {
      OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
      new_pool_inc.new_pool_max = tmp.get_pool_max();
      new_pool_inc.fsid = tmp.get_fsid();
      pg_pool_t empty;
      pool_infra_1706 = ++new_pool_inc.new_pool_max;
      pg_pool_t *p = new_pool_inc.get_new_pool(pool_infra_1706, &empty);
      p->size = 3;
      p->min_size = 1;
      p->set_pg_num(256);
      p->set_pgp_num(256);
      p->type = pg_pool_t::TYPE_REPLICATED;
      p->crush_rule = rno;
      p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
      new_pool_inc.new_pool_names[pool_infra_1706] = "pool_infra_1706";
      tmp.apply_incremental(new_pool_inc);
    }

    // add upmaps
    // Pin three pgs of the pool to explicit 3-OSD mappings.
    pg_t rep_pg(3, pool_infra_1706);
    pg_t rep_pgid = tmp.raw_pg_to_pg(rep_pg);
    pg_t rep_pg2(4, pool_infra_1706);
    pg_t rep_pgid2 = tmp.raw_pg_to_pg(rep_pg2);
    pg_t rep_pg3(6, pool_infra_1706);
    pg_t rep_pgid3 = tmp.raw_pg_to_pg(rep_pg3);
    {
      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
      pending_inc.new_pg_upmap[rep_pgid] = mempool::osdmap::vector<int32_t>({1,0,2});
      pending_inc.new_pg_upmap[rep_pgid2] = mempool::osdmap::vector<int32_t>({1,2,0});
      pending_inc.new_pg_upmap[rep_pgid3] = mempool::osdmap::vector<int32_t>({1,2,0});
      tmp.apply_incremental(pending_inc);
      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid));
      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid2));
      ASSERT_TRUE(tmp.have_pg_upmaps(rep_pgid3));
    }

    {
      // now, set pool size to 1
      // A 3-wide upmap no longer matches a size-1 pool, so
      // clean_pg_upmaps() must drop all three entries.
      OSDMap tmpmap;
      tmpmap.deepish_copy_from(tmp);
      OSDMap::Incremental new_pool_inc(tmpmap.get_epoch() + 1);
      pg_pool_t p = *tmpmap.get_pg_pool(pool_infra_1706);
      p.size = 1;
      p.last_change = new_pool_inc.epoch;
      new_pool_inc.new_pools[pool_infra_1706] = p;
      tmpmap.apply_incremental(new_pool_inc);

      OSDMap::Incremental new_pending_inc(tmpmap.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, tmpmap, new_pending_inc);
      tmpmap.apply_incremental(new_pending_inc);
      // check pg upmaps
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(rep_pgid));
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(rep_pgid2));
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(rep_pgid3));
    }
    {
      // now, set pool size to 4
      // Growing the pool also invalidates the 3-wide upmaps; they
      // must be removed as well (this was the BUG_51842 failure mode).
      OSDMap tmpmap;
      tmpmap.deepish_copy_from(tmp);
      OSDMap::Incremental new_pool_inc(tmpmap.get_epoch() + 1);
      pg_pool_t p = *tmpmap.get_pg_pool(pool_infra_1706);
      p.size = 4;
      p.last_change = new_pool_inc.epoch;
      new_pool_inc.new_pools[pool_infra_1706] = p;
      tmpmap.apply_incremental(new_pool_inc);

      OSDMap::Incremental new_pending_inc(tmpmap.get_epoch() + 1);
      clean_pg_upmaps(g_ceph_context, tmpmap, new_pending_inc);
      tmpmap.apply_incremental(new_pending_inc);
      // check pg upmaps
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(rep_pgid));
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(rep_pgid2));
      ASSERT_TRUE(!tmpmap.have_pg_upmaps(rep_pgid3));
    }
}
2112
// CIDR ranges used by the blocklisting_* tests below: a mix of IPv4
// and IPv6 prefixes with lengths from /22 down to single-host /32 and
// /128 (plus a two-host /127).
const string OSDMapTest::range_addrs[] = {"198.51.100.0/22", "10.2.5.102/32", "2001:db8::/48",
  "3001:db8::/72", "4001:db8::/30", "5001:db8::/64", "6001:db8::/128", "7001:db8::/127"};
// Addresses that fall inside the ranges above (first/last addresses of
// each range plus interior points) -- these must be blocklisted when
// the corresponding range entries are installed.
const string OSDMapTest::ip_addrs[] = {"198.51.100.14", "198.51.100.0", "198.51.103.255",
  "10.2.5.102",
  "2001:db8:0:0:0:0:0:0", "2001:db8:0:0:0:0001:ffff:ffff",
  "2001:db8:0:ffff:ffff:ffff:ffff:ffff",
  "3001:db8:0:0:0:0:0:0", "3001:db8:0:0:0:0001:ffff:ffff",
  "3001:db8:0:0:00ff:ffff:ffff:ffff",
  "4001:db8::", "4001:db8:0:0:0:0001:ffff:ffff",
  "4001:dbb:ffff:ffff:ffff:ffff:ffff:ffff",
  "5001:db8:0:0:0:0:0:0", "5001:db8:0:0:0:0:ffff:ffff",
  "5001:db8:0:0:ffff:ffff:ffff:ffff",
  "6001:db8:0:0:0:0:0:0",
  "7001:db8:0:0:0:0:0:0", "7001:db8:0:0:0:0:0:0001"
};
// Addresses immediately outside every range above -- these must never
// be reported as blocklisted by the range entries.
const string OSDMapTest::unblocked_ip_addrs[] = { "0.0.0.0", "1.1.1.1", "192.168.1.1",
  "198.51.99.255", "198.51.104.0",
  "10.2.5.101", "10.2.5.103",
  "2001:db7:ffff:ffff:ffff:ffff:ffff:ffff", "2001:db8:0001::",
  "3001:db7:ffff:ffff:ffff:ffff:ffff:ffff", "3001:db8:0:0:0100::",
  "4001:db7:ffff:ffff:ffff:ffff:ffff:ffff", "4001:dbc::",
  "5001:db7:ffff:ffff:ffff:ffff:ffff:ffff", "5001:db8:0:0001:0:0:0:0",
  "6001:db8:0:0:0:0:0:0001",
  "7001:db7:ffff:ffff:ffff:ffff:ffff:ffff", "7001:db8:0:0:0:0:0:0002"
};
2138
2139 TEST_F(OSDMapTest, blocklisting_ips) {
2140 set_up_map(6); //whatever
2141
2142 OSDMap::Incremental new_blocklist_inc(osdmap.get_epoch() + 1);
2143 for (const auto& a : ip_addrs) {
2144 entity_addr_t addr;
2145 addr.parse(a);
2146 addr.set_type(entity_addr_t::TYPE_LEGACY);
2147 new_blocklist_inc.new_blocklist[addr] = ceph_clock_now();
2148 }
2149 osdmap.apply_incremental(new_blocklist_inc);
2150
2151 for (const auto& a: ip_addrs) {
2152 entity_addr_t addr;
2153 addr.parse(a);
2154 addr.set_type(entity_addr_t::TYPE_LEGACY);
2155 ASSERT_TRUE(osdmap.is_blocklisted(addr, g_ceph_context));
2156 }
2157 for (const auto& a: unblocked_ip_addrs) {
2158 entity_addr_t addr;
2159 addr.parse(a);
2160 addr.set_type(entity_addr_t::TYPE_LEGACY);
2161 ASSERT_FALSE(osdmap.is_blocklisted(addr, g_ceph_context));
2162 }
2163
2164 OSDMap::Incremental rm_blocklist_inc(osdmap.get_epoch() + 1);
2165 for (const auto& a : ip_addrs) {
2166 entity_addr_t addr;
2167 addr.parse(a);
2168 addr.set_type(entity_addr_t::TYPE_LEGACY);
2169 rm_blocklist_inc.old_blocklist.push_back(addr);
2170 }
2171 osdmap.apply_incremental(rm_blocklist_inc);
2172 for (const auto& a: ip_addrs) {
2173 entity_addr_t addr;
2174 addr.parse(a);
2175 addr.set_type(entity_addr_t::TYPE_LEGACY);
2176 ASSERT_FALSE(osdmap.is_blocklisted(addr, g_ceph_context));
2177 }
2178 for (const auto& a: unblocked_ip_addrs) {
2179 entity_addr_t addr;
2180 addr.parse(a);
2181 addr.set_type(entity_addr_t::TYPE_LEGACY);
2182 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2183 if (blocklisted) {
2184 cout << "erroneously blocklisted " << addr << std::endl;
2185 }
2186 EXPECT_FALSE(blocklisted);
2187 }
2188 }
2189
2190 TEST_F(OSDMapTest, blocklisting_ranges) {
2191 set_up_map(6); //whatever
2192 OSDMap::Incremental range_blocklist_inc(osdmap.get_epoch() + 1);
2193 for (const auto& a : range_addrs) {
2194 entity_addr_t addr;
2195 addr.parse(a);
2196 addr.type = entity_addr_t::TYPE_CIDR;
2197 range_blocklist_inc.new_range_blocklist[addr] = ceph_clock_now();
2198 }
2199 osdmap.apply_incremental(range_blocklist_inc);
2200
2201 for (const auto& a: ip_addrs) {
2202 entity_addr_t addr;
2203 addr.parse(a);
2204 addr.set_type(entity_addr_t::TYPE_LEGACY);
2205 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2206 if (!blocklisted) {
2207 cout << "erroneously not blocklisted " << addr << std::endl;
2208 }
2209 ASSERT_TRUE(blocklisted);
2210 }
2211 for (const auto& a: unblocked_ip_addrs) {
2212 entity_addr_t addr;
2213 addr.parse(a);
2214 addr.set_type(entity_addr_t::TYPE_LEGACY);
2215 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2216 if (blocklisted) {
2217 cout << "erroneously blocklisted " << addr << std::endl;
2218 }
2219 EXPECT_FALSE(blocklisted);
2220 }
2221
2222 OSDMap::Incremental rm_range_blocklist(osdmap.get_epoch() + 1);
2223 for (const auto& a : range_addrs) {
2224 entity_addr_t addr;
2225 addr.parse(a);
2226 addr.type = entity_addr_t::TYPE_CIDR;
2227 rm_range_blocklist.old_range_blocklist.push_back(addr);
2228 }
2229 osdmap.apply_incremental(rm_range_blocklist);
2230
2231 for (const auto& a: ip_addrs) {
2232 entity_addr_t addr;
2233 addr.parse(a);
2234 addr.set_type(entity_addr_t::TYPE_LEGACY);
2235 ASSERT_FALSE(osdmap.is_blocklisted(addr, g_ceph_context));
2236 }
2237 for (const auto& a: unblocked_ip_addrs) {
2238 entity_addr_t addr;
2239 addr.parse(a);
2240 addr.set_type(entity_addr_t::TYPE_LEGACY);
2241 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2242 if (blocklisted) {
2243 cout << "erroneously blocklisted " << addr << std::endl;
2244 }
2245 EXPECT_FALSE(blocklisted);
2246 }
2247 }
2248
2249 TEST_F(OSDMapTest, blocklisting_everything) {
2250 set_up_map(6); //whatever
2251 OSDMap::Incremental range_blocklist_inc(osdmap.get_epoch() + 1);
2252 entity_addr_t baddr;
2253 baddr.parse("2001:db8::/0");
2254 baddr.type = entity_addr_t::TYPE_CIDR;
2255 range_blocklist_inc.new_range_blocklist[baddr] = ceph_clock_now();
2256 osdmap.apply_incremental(range_blocklist_inc);
2257
2258 for (const auto& a: ip_addrs) {
2259 entity_addr_t addr;
2260 addr.parse(a);
2261 addr.set_type(entity_addr_t::TYPE_LEGACY);
2262 if (addr.is_ipv4()) continue;
2263 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2264 if (!blocklisted) {
2265 cout << "erroneously not blocklisted " << addr << std::endl;
2266 }
2267 ASSERT_TRUE(blocklisted);
2268 }
2269 for (const auto& a: unblocked_ip_addrs) {
2270 entity_addr_t addr;
2271 addr.parse(a);
2272 addr.set_type(entity_addr_t::TYPE_LEGACY);
2273 if (addr.is_ipv4()) continue;
2274 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2275 if (!blocklisted) {
2276 cout << "erroneously not blocklisted " << addr << std::endl;
2277 }
2278 ASSERT_TRUE(blocklisted);
2279 }
2280
2281 OSDMap::Incremental swap_blocklist_inc(osdmap.get_epoch()+1);
2282 swap_blocklist_inc.old_range_blocklist.push_back(baddr);
2283
2284 entity_addr_t caddr;
2285 caddr.parse("1.1.1.1/0");
2286 caddr.type = entity_addr_t::TYPE_CIDR;
2287 swap_blocklist_inc.new_range_blocklist[caddr] = ceph_clock_now();
2288 osdmap.apply_incremental(swap_blocklist_inc);
2289
2290 for (const auto& a: ip_addrs) {
2291 entity_addr_t addr;
2292 addr.parse(a);
2293 addr.set_type(entity_addr_t::TYPE_LEGACY);
2294 if (!addr.is_ipv4()) continue;
2295 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2296 if (!blocklisted) {
2297 cout << "erroneously not blocklisted " << addr << std::endl;
2298 }
2299 ASSERT_TRUE(blocklisted);
2300 }
2301 for (const auto& a: unblocked_ip_addrs) {
2302 entity_addr_t addr;
2303 addr.parse(a);
2304 addr.set_type(entity_addr_t::TYPE_LEGACY);
2305 if (!addr.is_ipv4()) continue;
2306 bool blocklisted = osdmap.is_blocklisted(addr, g_ceph_context);
2307 if (!blocklisted) {
2308 cout << "erroneously not blocklisted " << addr << std::endl;
2309 }
2310 ASSERT_TRUE(blocklisted);
2311 }
2312 }
2313
2314 TEST_F(OSDMapTest, ReadBalanceScore1) {
2315 std::srand ( unsigned ( std::time(0) ) );
2316 uint osd_rand = rand() % 13;
2317 set_up_map(6 + osd_rand); //whatever
2318 auto pools = osdmap.get_pools();
2319 for (auto &[pid, pg_pool] : pools) {
2320 const pg_pool_t *pi = osdmap.get_pg_pool(pid);
2321 if (pi->is_replicated()) {
2322 //cout << "pool " << pid << " " << pg_pool << std::endl;
2323 auto replica_count = pi->get_size();
2324 OSDMap::read_balance_info_t rbi;
2325 auto rc = osdmap.calc_read_balance_score(g_ceph_context, pid, &rbi);
2326
2327 // "Normal" score is between 1 and num_osds
2328 ASSERT_TRUE(rc == 0);
2329 ASSERT_TRUE(score_in_range(rbi.adjusted_score));
2330 ASSERT_TRUE(score_in_range(rbi.acting_adj_score));
2331 ASSERT_TRUE(rbi.err_msg.empty());
2332
2333 // When all OSDs have primary_affinity 0, score should be 0
2334 auto num_osds = get_num_osds();
2335 set_primary_affinity_all(0.);
2336
2337 rc = osdmap.calc_read_balance_score(g_ceph_context, pid, &rbi);
2338 ASSERT_TRUE(rc < 0);
2339 ASSERT_TRUE(rbi.adjusted_score == 0.);
2340 ASSERT_TRUE(rbi.acting_adj_score == 0.);
2341 ASSERT_FALSE(rbi.err_msg.empty());
2342
2343 std::vector<uint> osds;
2344 for (uint i = 0 ; i < num_osds ; i++) {
2345 osds.push_back(i);
2346 }
2347
2348 // Change primary_affinity of some OSDs to 1 others are 0
2349 float fratio = 1. / (float)replica_count;
2350 for (int iter = 0 ; iter < 100 ; iter++) { // run the test 100 times
2351 // Create random shuffle of OSDs
2352 std::random_shuffle (osds.begin(), osds.end());
2353 for (uint i = 0 ; i < num_osds ; i++) {
2354 if ((float(i + 1) / float(num_osds)) < fratio) {
2355 ASSERT_TRUE(osds[i] < num_osds);
2356 osdmap.set_primary_affinity(osds[i], CEPH_OSD_MAX_PRIMARY_AFFINITY);
2357 rc = osdmap.calc_read_balance_score(g_ceph_context, pid, &rbi);
2358
2359 ASSERT_TRUE(rc < 0);
2360 ASSERT_TRUE(rbi.adjusted_score == 0.);
2361 ASSERT_TRUE(rbi.acting_adj_score == 0.);
2362 ASSERT_FALSE(rbi.err_msg.empty());
2363 }
2364 else {
2365 if (rc < 0) {
2366 ASSERT_TRUE(rbi.adjusted_score == 0.);
2367 ASSERT_TRUE(rbi.acting_adj_score == 0.);
2368 ASSERT_FALSE(rbi.err_msg.empty());
2369 }
2370 else {
2371 ASSERT_TRUE(score_in_range(rbi.acting_adj_score, i + 1));
2372 ASSERT_TRUE(rbi.err_msg.empty());
2373 }
2374 }
2375 }
2376 set_primary_affinity_all(0.);
2377 }
2378 }
2379 }
2380
2381 }
2382
2383 TEST_F(OSDMapTest, ReadBalanceScore2) {
2384 std::srand ( unsigned ( std::time(0) ) );
2385 uint osd_num = 6 + rand() % 13;
2386 set_up_map(osd_num, true);
2387 for (int i = 0 ; i < 100 ; i++) { //running 100 random tests
2388 uint num_pa_osds = 0;
2389 float pa_sum = 0.;
2390 OSDMap::read_balance_info_t rbi;
2391
2392 // set pa for all osds
2393 for (uint j = 0 ; j < osd_num ; j++) {
2394 uint pa = 1 + rand() % 100;
2395 if (pa > 80)
2396 pa = 100;
2397 if (pa < 20)
2398 pa = 0;
2399 float fpa = (float)pa / 100.;
2400 if (pa > 0) {
2401 num_pa_osds++;
2402 pa_sum += fpa;
2403 }
2404 osdmap.set_primary_affinity(j, int(fpa * CEPH_OSD_MAX_PRIMARY_AFFINITY));
2405 }
2406 float pa_ratio = pa_sum / (float) osd_num;
2407
2408 // create a pool with the current osdmap configuration
2409 OSDMap::Incremental new_pool_inc(osdmap.get_epoch() + 1);
2410 new_pool_inc.new_pool_max = osdmap.get_pool_max();
2411 new_pool_inc.fsid = osdmap.get_fsid();
2412 string pool_name = "rep_pool" + stringify(i);
2413 uint64_t new_pid = set_rep_pool(pool_name, new_pool_inc, false);
2414 ASSERT_TRUE(new_pid > 0);
2415 osdmap.apply_incremental(new_pool_inc);
2416
2417 // now run the test on the pool.
2418 const pg_pool_t *pi = osdmap.get_pg_pool(new_pid);
2419 ASSERT_NE(pi, nullptr);
2420 ASSERT_TRUE(pi->is_replicated());
2421 float fratio = 1. / (float)pi->get_size();
2422 auto rc = osdmap.calc_read_balance_score(g_ceph_context, new_pid, &rbi);
2423 if (pa_ratio < fratio) {
2424 ASSERT_TRUE(rc < 0);
2425 ASSERT_FALSE(rbi.err_msg.empty());
2426 ASSERT_TRUE(rbi.acting_adj_score == 0.);
2427 ASSERT_TRUE(rbi.adjusted_score == 0.);
2428 }
2429 else {
2430 if (rc < 0) {
2431 ASSERT_TRUE(rbi.adjusted_score == 0.);
2432 ASSERT_TRUE(rbi.acting_adj_score == 0.);
2433 ASSERT_FALSE(rbi.err_msg.empty());
2434 }
2435 else {
2436 if (rbi.err_msg.empty()) {
2437 ASSERT_TRUE(score_in_range(rbi.acting_adj_score, num_pa_osds));
2438 }
2439 }
2440 }
2441
2442 }
2443 //TODO add ReadBalanceScore3 - with weighted osds.
2444
2445 }
2446
2447 TEST_F(OSDMapTest, read_balance_small_map) {
2448 // Set up a map with 4 OSDs and default pools
2449 set_up_map(4);
2450
2451 const vector<string> test_cases = {"basic", "prim_affinity"};
2452 for (const auto & test : test_cases) {
2453 if (test == "prim_affinity") {
2454 // Make osd.0 off-limits for primaries by giving it prim affinity 0
2455 OSDMap::Incremental pending_inc0(osdmap.get_epoch() + 1);
2456 pending_inc0.new_primary_affinity[0] = 0;
2457 osdmap.apply_incremental(pending_inc0);
2458
2459 // Ensure osd.0 has no primaries assigned to it
2460 map<uint64_t,set<pg_t>> prim_pgs_by_osd, acting_prims_by_osd;
2461 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd, &acting_prims_by_osd);
2462 ASSERT_TRUE(prim_pgs_by_osd[0].size() == 0);
2463 ASSERT_TRUE(acting_prims_by_osd[0].size() == 0);
2464 }
2465
2466 // Make sure capacity is balanced first
2467 set<int64_t> only_pools;
2468 only_pools.insert(my_rep_pool);
2469 OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
2470 osdmap.calc_pg_upmaps(g_ceph_context,
2471 0,
2472 100,
2473 only_pools,
2474 &pending_inc);
2475 osdmap.apply_incremental(pending_inc);
2476
2477 // Get read balance score before balancing
2478 OSDMap::read_balance_info_t rb_info;
2479 auto rc = osdmap.calc_read_balance_score(g_ceph_context, my_rep_pool, &rb_info);
2480 ASSERT_TRUE(rc >= 0);
2481 float read_balance_score_before = rb_info.adjusted_score;
2482
2483 // Calculate desired prim distributions to verify later
2484 map<uint64_t,set<pg_t>> prim_pgs_by_osd_2, acting_prims_by_osd_2;
2485 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_2, &acting_prims_by_osd_2);
2486 vector<uint64_t> osds_to_check;
2487 for (const auto & [osd, pgs] : prim_pgs_by_osd_2) {
2488 osds_to_check.push_back(osd);
2489 }
2490 map<uint64_t,float> desired_prim_dist;
2491 rc = osdmap.calc_desired_primary_distribution(g_ceph_context, my_rep_pool,
2492 osds_to_check, desired_prim_dist);
2493 ASSERT_TRUE(rc >= 0);
2494
2495 // Balance reads
2496 OSDMap::Incremental pending_inc_2(osdmap.get_epoch()+1);
2497 int num_changes = osdmap.balance_primaries(g_ceph_context, my_rep_pool, &pending_inc_2, osdmap);
2498 osdmap.apply_incremental(pending_inc_2);
2499
2500 if (test == "prim_affinity") {
2501 // Ensure osd.0 still has no primaries assigned to it
2502 map<uint64_t,set<pg_t>> prim_pgs_by_osd_3, acting_prims_by_osd_3;
2503 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_3, &acting_prims_by_osd_3);
2504 ASSERT_TRUE(prim_pgs_by_osd_3[0].size() == 0);
2505 ASSERT_TRUE(acting_prims_by_osd_3[0].size() == 0);
2506 }
2507
2508 // Get read balance score after balancing
2509 rc = osdmap.calc_read_balance_score(g_ceph_context, my_rep_pool, &rb_info);
2510 ASSERT_TRUE(rc >= 0);
2511 float read_balance_score_after = rb_info.adjusted_score;
2512
2513 // Ensure the score hasn't gotten worse
2514 ASSERT_TRUE(read_balance_score_after <= read_balance_score_before);
2515
2516 // Check for improvements
2517 if (num_changes > 0) {
2518 ASSERT_TRUE(read_balance_score_after < read_balance_score_before);
2519
2520 // Check num primaries for each OSD is within range
2521 map<uint64_t,set<pg_t>> prim_pgs_by_osd_4, acting_prims_by_osd_4;
2522 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_4, &acting_prims_by_osd_4);
2523 for (const auto & [osd, primaries] : prim_pgs_by_osd_4) {
2524 ASSERT_TRUE(primaries.size() >= floor(desired_prim_dist[osd] - 1));
2525 ASSERT_TRUE(primaries.size() <= ceil(desired_prim_dist[osd] + 1));
2526 }
2527 }
2528 }
2529 }
2530
2531 TEST_F(OSDMapTest, read_balance_large_map) {
2532 // Set up a map with 60 OSDs and default pools
2533 set_up_map(60);
2534
2535 const vector<string> test_cases = {"basic", "prim_affinity"};
2536 for (const auto & test : test_cases) {
2537 if (test == "prim_affinity") {
2538 // Make osd.0 off-limits for primaries by giving it prim affinity 0
2539 OSDMap::Incremental pending_inc0(osdmap.get_epoch() + 1);
2540 pending_inc0.new_primary_affinity[0] = 0;
2541 osdmap.apply_incremental(pending_inc0);
2542
2543 // Ensure osd.0 has no primaries assigned to it
2544 map<uint64_t,set<pg_t>> prim_pgs_by_osd, acting_prims_by_osd;
2545 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd, &acting_prims_by_osd);
2546 ASSERT_TRUE(prim_pgs_by_osd[0].size() == 0);
2547 ASSERT_TRUE(acting_prims_by_osd[0].size() == 0);
2548 }
2549
2550 // Make sure capacity is balanced first
2551 set<int64_t> only_pools;
2552 only_pools.insert(my_rep_pool);
2553 OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
2554 osdmap.calc_pg_upmaps(g_ceph_context,
2555 0,
2556 100,
2557 only_pools,
2558 &pending_inc);
2559 osdmap.apply_incremental(pending_inc);
2560
2561 // Get read balance score before balancing
2562 OSDMap::read_balance_info_t rb_info;
2563 auto rc = osdmap.calc_read_balance_score(g_ceph_context, my_rep_pool, &rb_info);
2564 ASSERT_TRUE(rc >= 0);
2565 float read_balance_score_before = rb_info.adjusted_score;
2566
2567 // Calculate desired prim distributions to verify later
2568 map<uint64_t,set<pg_t>> prim_pgs_by_osd_2, acting_prims_by_osd_2;
2569 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_2, &acting_prims_by_osd_2);
2570 vector<uint64_t> osds_to_check;
2571 for (auto [osd, pgs] : prim_pgs_by_osd_2) {
2572 osds_to_check.push_back(osd);
2573 }
2574 map<uint64_t,float> desired_prim_dist;
2575 rc = osdmap.calc_desired_primary_distribution(g_ceph_context, my_rep_pool,
2576 osds_to_check, desired_prim_dist);
2577 ASSERT_TRUE(rc >= 0);
2578
2579 // Balance reads
2580 OSDMap::Incremental pending_inc_2(osdmap.get_epoch()+1);
2581 int num_changes = osdmap.balance_primaries(g_ceph_context, my_rep_pool, &pending_inc_2, osdmap);
2582 osdmap.apply_incremental(pending_inc_2);
2583
2584 if (test == "prim_affinity") {
2585 // Ensure osd.0 still has no primaries assigned to it
2586 map<uint64_t,set<pg_t>> prim_pgs_by_osd_3, acting_prims_by_osd_3;
2587 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_3, &acting_prims_by_osd_3);
2588 ASSERT_TRUE(prim_pgs_by_osd_3[0].size() == 0);
2589 ASSERT_TRUE(acting_prims_by_osd_3[0].size() == 0);
2590 }
2591
2592 // Get read balance score after balancing
2593 rc = osdmap.calc_read_balance_score(g_ceph_context, my_rep_pool, &rb_info);
2594 ASSERT_TRUE(rc >= 0);
2595 float read_balance_score_after = rb_info.adjusted_score;
2596
2597 // Ensure the score hasn't gotten worse
2598 ASSERT_TRUE(read_balance_score_after <= read_balance_score_before);
2599
2600 // Check for improvements
2601 if (num_changes > 0) {
2602 ASSERT_TRUE(read_balance_score_after < read_balance_score_before);
2603
2604 // Check num primaries for each OSD is within range
2605 map<uint64_t,set<pg_t>> prim_pgs_by_osd_4, acting_prims_by_osd_4;
2606 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_4, &acting_prims_by_osd_4);
2607 for (const auto & [osd, primaries] : prim_pgs_by_osd_4) {
2608 ASSERT_TRUE(primaries.size() >= floor(desired_prim_dist[osd] - 1));
2609 ASSERT_TRUE(primaries.size() <= ceil(desired_prim_dist[osd] + 1));
2610 }
2611 }
2612 }
2613 }
2614
2615 TEST_F(OSDMapTest, read_balance_random_map) {
2616 // Set up map with random number of OSDs
2617 std::srand ( unsigned ( std::time(0) ) );
2618 uint num_osds = 3 + (rand() % 10);
2619 ASSERT_TRUE(num_osds >= 3);
2620 set_up_map(num_osds);
2621
2622 const vector<string> test_cases = {"basic", "prim_affinity"};
2623 for (const auto & test : test_cases) {
2624 uint rand_osd = rand() % num_osds;
2625 if (test == "prim_affinity") {
2626 // Make a random OSD off-limits for primaries by giving it prim affinity 0
2627 ASSERT_TRUE(rand_osd < num_osds);
2628 OSDMap::Incremental pending_inc0(osdmap.get_epoch() + 1);
2629 pending_inc0.new_primary_affinity[rand_osd] = 0;
2630 osdmap.apply_incremental(pending_inc0);
2631
2632 // Ensure the random OSD has no primaries assigned to it
2633 map<uint64_t,set<pg_t>> prim_pgs_by_osd, acting_prims_by_osd;
2634 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd, &acting_prims_by_osd);
2635 ASSERT_TRUE(prim_pgs_by_osd[rand_osd].size() == 0);
2636 ASSERT_TRUE(acting_prims_by_osd[rand_osd].size() == 0);
2637 }
2638
2639 // Make sure capacity is balanced first
2640 set<int64_t> only_pools;
2641 only_pools.insert(my_rep_pool);
2642 OSDMap::Incremental pending_inc(osdmap.get_epoch()+1);
2643 osdmap.calc_pg_upmaps(g_ceph_context,
2644 0,
2645 100,
2646 only_pools,
2647 &pending_inc);
2648 osdmap.apply_incremental(pending_inc);
2649
2650 // Get read balance score before balancing
2651 OSDMap::read_balance_info_t rb_info;
2652 auto rc = osdmap.calc_read_balance_score(g_ceph_context, my_rep_pool, &rb_info);
2653 ASSERT_TRUE(rc >= 0);
2654 float read_balance_score_before = rb_info.adjusted_score;
2655
2656 // Calculate desired prim distributions to verify later
2657 map<uint64_t,set<pg_t>> prim_pgs_by_osd_2, acting_prims_by_osd_2;
2658 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_2, &acting_prims_by_osd_2);
2659 vector<uint64_t> osds_to_check;
2660 for (const auto & [osd, pgs] : prim_pgs_by_osd_2) {
2661 osds_to_check.push_back(osd);
2662 }
2663 map<uint64_t,float> desired_prim_dist;
2664 rc = osdmap.calc_desired_primary_distribution(g_ceph_context, my_rep_pool,
2665 osds_to_check, desired_prim_dist);
2666 ASSERT_TRUE(rc >= 0);
2667
2668 // Balance reads
2669 OSDMap::Incremental pending_inc_2(osdmap.get_epoch()+1);
2670 int num_changes = osdmap.balance_primaries(g_ceph_context, my_rep_pool, &pending_inc_2, osdmap);
2671 osdmap.apply_incremental(pending_inc_2);
2672
2673 if (test == "prim_affinity") {
2674 // Ensure the random OSD still has no primaries assigned to it
2675 map<uint64_t,set<pg_t>> prim_pgs_by_osd_3, acting_prims_by_osd_3;
2676 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_3, &acting_prims_by_osd_3);
2677 ASSERT_TRUE(prim_pgs_by_osd_3[rand_osd].size() == 0);
2678 ASSERT_TRUE(acting_prims_by_osd_3[rand_osd].size() == 0);
2679 }
2680
2681 // Get read balance score after balancing
2682 rc = osdmap.calc_read_balance_score(g_ceph_context, my_rep_pool, &rb_info);
2683 ASSERT_TRUE(rc >= 0);
2684 float read_balance_score_after = rb_info.adjusted_score;
2685
2686 // Ensure the score hasn't gotten worse
2687 ASSERT_TRUE(read_balance_score_after <= read_balance_score_before);
2688
2689 // Check for improvements
2690 if (num_changes > 0) {
2691 ASSERT_TRUE(read_balance_score_after < read_balance_score_before);
2692
2693 // Check num primaries for each OSD is within range
2694 map<uint64_t,set<pg_t>> prim_pgs_by_osd_4, acting_prims_by_osd_4;
2695 osdmap.get_pgs_by_osd(g_ceph_context, my_rep_pool, &prim_pgs_by_osd_4, &acting_prims_by_osd_4);
2696 for (auto [osd, primaries] : prim_pgs_by_osd_4) {
2697 ASSERT_TRUE(primaries.size() >= floor(desired_prim_dist[osd] - 1));
2698 ASSERT_TRUE(primaries.size() <= ceil(desired_prim_dist[osd] + 1));
2699 }
2700 for (auto [osd, primaries] : prim_pgs_by_osd_4) {
2701 ASSERT_TRUE(primaries.size() >= floor(desired_prim_dist[osd] - 1));
2702 ASSERT_TRUE(primaries.size() <= ceil(desired_prim_dist[osd] + 1));
2703 }
2704 }
2705 }
2706 }
2707
2708 INSTANTIATE_TEST_SUITE_P(
2709 OSDMap,
2710 OSDMapTest,
2711 ::testing::Values(
2712 std::make_pair<int, int>(0, 1), // chooseleaf firstn 0 host
2713 std::make_pair<int, int>(3, 1), // chooseleaf firstn 3 host
2714 std::make_pair<int, int>(0, 0), // chooseleaf firstn 0 osd
2715 std::make_pair<int, int>(3, 0) // chooseleaf firstn 3 osd
2716 )
2717 );