import 14.2.4 nautilus point release
diff --git a/ceph/src/test/osd/TestOSDMap.cc b/ceph/src/test/osd/TestOSDMap.cc
index 251ae4d994064301fb5ca9b89c6451990c563350..2bd873fd01a615125cce5defdd71e0a88ebfc7f0 100644
@@ -2,6 +2,7 @@
 #include "gtest/gtest.h"
 #include "osd/OSDMap.h"
 #include "osd/OSDMapMapping.h"
+#include "mon/OSDMonitor.h"
 
 #include "global/global_context.h"
 #include "global/global_init.h"
 using namespace std;
 
 int main(int argc, char **argv) {
+  map<string,string> defaults = {
+    // make sure we have 3 copies, or some tests won't work
+    { "osd_pool_default_size", "3" },
+    // our map is flat, so just split across OSDs, not across hosts
+    { "osd_crush_chooseleaf_type", "0" },
+  };
   std::vector<const char*> args(argv, argv+argc);
-  env_to_vec(args);
-  auto cct = global_init(nullptr, args, CEPH_ENTITY_TYPE_CLIENT,
+  auto cct = global_init(&defaults, args, CEPH_ENTITY_TYPE_CLIENT,
                         CODE_ENVIRONMENT_UTILITY,
                         CINIT_FLAG_NO_DEFAULT_CONFIG_FILE);
   common_init_finish(g_ceph_context);
-  // make sure we have 3 copies, or some tests won't work
-  g_ceph_context->_conf->set_val("osd_pool_default_size", "3", false);
-  // our map is flat, so just try and split across OSDs, not hosts or whatever
-  g_ceph_context->_conf->set_val("osd_crush_chooseleaf_type", "0", false);
   ::testing::InitGoogleTest(&argc, argv);
   return RUN_ALL_TESTS();
 }
 
 class OSDMapTest : public testing::Test {
-  const static int num_osds = 6;
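+  // no longer const: set_up_map() can now size the cluster per test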
+  int num_osds = 6;
 public:
   OSDMap osdmap;
   OSDMapMapping mapping;
@@ -38,25 +40,29 @@ public:
 
   OSDMapTest() {}
 
-  void set_up_map() {
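+  // new_num_osds sizes the cluster; no_default_pools skips creating the default ec/rep pools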
+  void set_up_map(int new_num_osds = 6, bool no_default_pools = false) {
+    num_osds = new_num_osds;
     uuid_d fsid;
     osdmap.build_simple(g_ceph_context, 0, fsid, num_osds);
     OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
     pending_inc.fsid = osdmap.get_fsid();
-    entity_addr_t sample_addr;
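+    // nautilus addresses each osd with a vector of addrs (msgr2); one legacy addr is enough here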
+    entity_addrvec_t sample_addrs;
+    sample_addrs.v.push_back(entity_addr_t());
     uuid_d sample_uuid;
     for (int i = 0; i < num_osds; ++i) {
       sample_uuid.generate_random();
-      sample_addr.nonce = i;
+      sample_addrs.v[0].nonce = i;
       pending_inc.new_state[i] = CEPH_OSD_EXISTS | CEPH_OSD_NEW;
-      pending_inc.new_up_client[i] = sample_addr;
-      pending_inc.new_up_cluster[i] = sample_addr;
-      pending_inc.new_hb_back_up[i] = sample_addr;
-      pending_inc.new_hb_front_up[i] = sample_addr;
+      pending_inc.new_up_client[i] = sample_addrs;
+      pending_inc.new_up_cluster[i] = sample_addrs;
+      pending_inc.new_hb_back_up[i] = sample_addrs;
+      pending_inc.new_hb_front_up[i] = sample_addrs;
       pending_inc.new_weight[i] = CEPH_OSD_IN;
       pending_inc.new_uuid[i] = sample_uuid;
     }
     osdmap.apply_incremental(pending_inc);
+    if (no_default_pools) // do not create any default pool(s)
+      return;
 
     // Create an EC ruleset and a pool using it
     int r = osdmap.crush->add_simple_rule(
@@ -70,7 +76,7 @@ public:
     pg_pool_t empty;
     // make an ec pool
     uint64_t pool_id = ++new_pool_inc.new_pool_max;
-    assert(pool_id == my_ec_pool);
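+    // ceph_assert, unlike plain assert, is never compiled out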
+    ceph_assert(pool_id == my_ec_pool);
     pg_pool_t *p = new_pool_inc.get_new_pool(pool_id, &empty);
     p->size = 3;
     p->set_pg_num(64);
@@ -80,7 +86,7 @@ public:
     new_pool_inc.new_pool_names[pool_id] = "ec";
     // and a replicated pool
     pool_id = ++new_pool_inc.new_pool_max;
-    assert(pool_id == my_rep_pool);
+    ceph_assert(pool_id == my_rep_pool);
     p = new_pool_inc.get_new_pool(pool_id, &empty);
     p->size = 3;
     p->set_pg_num(64);
@@ -92,17 +98,17 @@ public:
     osdmap.apply_incremental(new_pool_inc);
   }
   unsigned int get_num_osds() { return num_osds; }
-  void get_crush(CrushWrapper& newcrush) {
+  void get_crush(const OSDMap& tmap, CrushWrapper& newcrush) {
     bufferlist bl;
-    osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
-    bufferlist::iterator p = bl.begin();
+    tmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
+    auto p = bl.cbegin();
     newcrush.decode(p);
   }
-  int crush_move(const string &name, const vector<string> &argvec) {
+  int crush_move(OSDMap& tmap, const string &name, const vector<string> &argvec) {
     map<string,string> loc;
     CrushWrapper::parse_loc_map(argvec, &loc);
     CrushWrapper newcrush;
-    get_crush(newcrush);
+    get_crush(tmap, newcrush);
     if (!newcrush.name_exists(name)) {
        return -ENOENT;
     }
@@ -115,10 +121,10 @@ public:
         err = newcrush.move_bucket(g_ceph_context, id, loc);
       }
       if (err >= 0) {
-        OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+        OSDMap::Incremental pending_inc(tmap.get_epoch() + 1);
         pending_inc.crush.clear();
         newcrush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
-        osdmap.apply_incremental(pending_inc);
+        tmap.apply_incremental(pending_inc);
         err = 0;
       }
     } else {
@@ -134,7 +140,7 @@ public:
       return osdmap.crush->get_rule_id(name);
     }
     CrushWrapper newcrush;
-    get_crush(newcrush);
+    get_crush(osdmap, newcrush);
     string device_class;
     stringstream ss;
     int ruleno = newcrush.add_simple_rule(
@@ -181,6 +187,21 @@ public:
     cout << "first: " << *first << std::endl;;
     cout << "primary: " << *primary << std::endl;;
   }
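+  // mirror the monitor's parallel cleanup path: fan the upmap pgs out to a
+  // thread pool in chunks and collect the removals into pending_inc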
+  void clean_pg_upmaps(CephContext *cct,
+                       const OSDMap& om,
+                       OSDMap::Incremental& pending_inc) {
+    int cpu_num = 8;
+    int pgs_per_chunk = 256;
+    ThreadPool tp(cct, "BUG_40104::clean_upmap_tp", "clean_upmap_tp", cpu_num);
+    tp.start();
+    ParallelPGMapper mapper(cct, &tp);
+    vector<pg_t> pgs_to_check;
+    om.get_upmap_pgs(&pgs_to_check);
+    OSDMonitor::CleanUpmapJob job(cct, om, pending_inc);
+    mapper.queue(&job, pgs_per_chunk, pgs_to_check);
+    job.wait();
+    tp.stop();
+  }
 };
 
 TEST_F(OSDMapTest, Create) {
@@ -197,7 +218,6 @@ TEST_F(OSDMapTest, Features) {
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
-  ASSERT_TRUE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
   ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
   ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
 
@@ -207,7 +227,6 @@ TEST_F(OSDMapTest, Features) {
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3);
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_V2);
-  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);  // dont' need this
   ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
   ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
 
@@ -224,7 +243,6 @@ TEST_F(OSDMapTest, Features) {
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES2);
   ASSERT_TRUE(features & CEPH_FEATURE_CRUSH_TUNABLES3); // shared bit with primary affinity
   ASSERT_FALSE(features & CEPH_FEATURE_CRUSH_V2);
-  ASSERT_FALSE(features & CEPH_FEATURE_OSD_ERASURE_CODES);
   ASSERT_TRUE(features & CEPH_FEATURE_OSDHASHPSPOOL);
   ASSERT_TRUE(features & CEPH_FEATURE_OSD_PRIMARY_AFFINITY);
 
@@ -235,7 +253,7 @@ TEST_F(OSDMapTest, MapPG) {
   set_up_map();
 
   std::cerr << " osdmap.pool_max==" << osdmap.get_pool_max() << std::endl;
-  pg_t rawpg(0, my_rep_pool, -1);
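+  // pg_t dropped its unused "preferred" osd argument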
+  pg_t rawpg(0, my_rep_pool);
   pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
   vector<int> up_osds, acting_osds;
   int up_primary, acting_primary;
@@ -254,7 +272,7 @@ TEST_F(OSDMapTest, MapPG) {
 TEST_F(OSDMapTest, MapFunctionsMatch) {
   // TODO: make sure pg_to_up_acting_osds and pg_to_acting_osds match
   set_up_map();
-  pg_t rawpg(0, my_rep_pool, -1);
+  pg_t rawpg(0, my_rep_pool);
   pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
   vector<int> up_osds, acting_osds;
   int up_primary, acting_primary;
@@ -282,7 +300,7 @@ TEST_F(OSDMapTest, MapFunctionsMatch) {
 TEST_F(OSDMapTest, PrimaryIsFirst) {
   set_up_map();
 
-  pg_t rawpg(0, my_rep_pool, -1);
+  pg_t rawpg(0, my_rep_pool);
   pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
   vector<int> up_osds, acting_osds;
   int up_primary, acting_primary;
@@ -296,7 +314,7 @@ TEST_F(OSDMapTest, PrimaryIsFirst) {
 TEST_F(OSDMapTest, PGTempRespected) {
   set_up_map();
 
-  pg_t rawpg(0, my_rep_pool, -1);
+  pg_t rawpg(0, my_rep_pool);
   pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
   vector<int> up_osds, acting_osds;
   int up_primary, acting_primary;
@@ -324,7 +342,7 @@ TEST_F(OSDMapTest, PGTempRespected) {
 TEST_F(OSDMapTest, PrimaryTempRespected) {
   set_up_map();
 
-  pg_t rawpg(0, my_rep_pool, -1);
+  pg_t rawpg(0, my_rep_pool);
   pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
   vector<int> up_osds;
   vector<int> acting_osds;
@@ -371,7 +389,10 @@ TEST_F(OSDMapTest, CleanTemps) {
 
   osdmap.apply_incremental(pgtemp_map);
 
-  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
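+  // clean_temps now compares the current map with the next one, so stage pending_inc into a scratch copy first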
+  OSDMap tmpmap;
+  tmpmap.deepish_copy_from(osdmap);
+  tmpmap.apply_incremental(pending_inc);
+  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
 
   EXPECT_TRUE(pending_inc.new_pg_temp.count(pga) &&
              pending_inc.new_pg_temp[pga].size() == 0);
@@ -384,7 +405,7 @@ TEST_F(OSDMapTest, CleanTemps) {
 TEST_F(OSDMapTest, KeepsNecessaryTemps) {
   set_up_map();
 
-  pg_t rawpg(0, my_rep_pool, -1);
+  pg_t rawpg(0, my_rep_pool);
   pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
   vector<int> up_osds, acting_osds;
   int up_primary, acting_primary;
@@ -421,7 +442,10 @@ TEST_F(OSDMapTest, KeepsNecessaryTemps) {
 
   OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
 
-  OSDMap::clean_temps(g_ceph_context, osdmap, &pending_inc);
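+  // as above: hand clean_temps the post-incremental map to compare against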
+  OSDMap tmpmap;
+  tmpmap.deepish_copy_from(osdmap);
+  tmpmap.apply_incremental(pending_inc);
+  OSDMap::clean_temps(g_ceph_context, osdmap, tmpmap, &pending_inc);
   EXPECT_FALSE(pending_inc.new_pg_temp.count(pgid));
   EXPECT_FALSE(pending_inc.new_primary_temp.count(pgid));
 }
@@ -501,6 +525,37 @@ TEST_F(OSDMapTest, PrimaryAffinity) {
   }
 }
 
+TEST_F(OSDMapTest, get_osd_crush_node_flags) {
+  set_up_map();
+
+  for (unsigned i=0; i<get_num_osds(); ++i) {
+    ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
+  }
+
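+  // bucket id -1 is the "default" root, so flags set there cascade to every osd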
+  OSDMap::Incremental inc(osdmap.get_epoch() + 1);
+  inc.new_crush_node_flags[-1] = 123u;
+  osdmap.apply_incremental(inc);
+  for (unsigned i=0; i<get_num_osds(); ++i) {
+    ASSERT_EQ(123u, osdmap.get_osd_crush_node_flags(i));
+  }
+  ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));
+
+  OSDMap::Incremental inc3(osdmap.get_epoch() + 1);
+  inc3.new_crush_node_flags[-1] = 456u;
+  osdmap.apply_incremental(inc3);
+  for (unsigned i=0; i<get_num_osds(); ++i) {
+    ASSERT_EQ(456u, osdmap.get_osd_crush_node_flags(i));
+  }
+  ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(1000));
+
+  OSDMap::Incremental inc2(osdmap.get_epoch() + 1);
+  inc2.new_crush_node_flags[-1] = 0;
+  osdmap.apply_incremental(inc2);
+  for (unsigned i=0; i<get_num_osds(); ++i) {
+    ASSERT_EQ(0u, osdmap.get_osd_crush_node_flags(i));
+  }
+}
+
 TEST_F(OSDMapTest, parse_osd_id_list) {
   set_up_map();
   set<int> out;
@@ -508,20 +563,20 @@ TEST_F(OSDMapTest, parse_osd_id_list) {
   osdmap.get_all_osds(all);
 
   ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0"}, &out, &cout));
-  ASSERT_EQ(1, out.size());
+  ASSERT_EQ(1u, out.size());
   ASSERT_EQ(0, *out.begin());
 
   ASSERT_EQ(0, osdmap.parse_osd_id_list({"1"}, &out, &cout));
-  ASSERT_EQ(1, out.size());
+  ASSERT_EQ(1u, out.size());
   ASSERT_EQ(1, *out.begin());
 
   ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","osd.1"}, &out, &cout));
-  ASSERT_EQ(2, out.size());
+  ASSERT_EQ(2u, out.size());
   ASSERT_EQ(0, *out.begin());
   ASSERT_EQ(1, *out.rbegin());
 
   ASSERT_EQ(0, osdmap.parse_osd_id_list({"osd.0","1"}, &out, &cout));
-  ASSERT_EQ(2, out.size());
+  ASSERT_EQ(2u, out.size());
   ASSERT_EQ(0, *out.begin());
   ASSERT_EQ(1, *out.rbegin());
 
@@ -559,7 +614,7 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
     move_to.push_back("root=default");
     string host_loc = "host=" + host_name.str();
     move_to.push_back(host_loc);
-    int r = crush_move(osd_name.str(), move_to);
+    int r = crush_move(osdmap, osd_name.str(), move_to);
     ASSERT_EQ(0, r);
   }
   const string upmap_rule = "upmap";
@@ -628,7 +683,7 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
     nextmap.apply_incremental(pending_inc);
     ASSERT_TRUE(nextmap.have_pg_upmaps(pgid));
     OSDMap::Incremental new_pending_inc(nextmap.get_epoch() + 1);
-    nextmap.clean_pg_upmaps(g_ceph_context, &new_pending_inc);
+    clean_pg_upmaps(g_ceph_context, nextmap, new_pending_inc);
     nextmap.apply_incremental(new_pending_inc);
     ASSERT_TRUE(!nextmap.have_pg_upmaps(pgid));
   }
@@ -675,9 +730,9 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
       ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
     }
     {
-      // confirm *maybe_remove_pg_upmaps* won't do anything bad
+      // confirm *clean_pg_upmaps* won't do anything bad
       OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
-      tmpmap.maybe_remove_pg_upmaps(g_ceph_context, tmpmap, &pending_inc);
+      clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
       tmpmap.apply_incremental(pending_inc);
       ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
     }
@@ -725,16 +780,138 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
       ASSERT_TRUE(tmpmap.have_pg_upmaps(ec_pgid));
     }
     {
-      // *maybe_remove_pg_upmaps* should be able to remove the above *bad* mapping
+      // *clean_pg_upmaps* should be able to remove the above *bad* mapping
       OSDMap::Incremental pending_inc(tmpmap.get_epoch() + 1);
-      OSDMap nextmap;
-      nextmap.deepish_copy_from(tmpmap);
-      nextmap.maybe_remove_pg_upmaps(g_ceph_context, nextmap, &pending_inc);
+      clean_pg_upmaps(g_ceph_context, tmpmap, pending_inc);
       tmpmap.apply_incremental(pending_inc);
       ASSERT_TRUE(!tmpmap.have_pg_upmaps(ec_pgid));
     }
   }
 
+  {
+    // http://tracker.ceph.com/issues/37968
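+    // a pg_upmap_item that stays within one host of the EC rule is valid and must survive cleanup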
+
+    // build a temporary crush topology of 2 hosts, 3 osds per host
+    OSDMap tmp; // use a temporary map here, so we do not dirty the original map
+    tmp.deepish_copy_from(osdmap);
+    const int expected_host_num = 2;
+    int osd_per_host = get_num_osds() / expected_host_num;
+    ASSERT_GE(osd_per_host, 3);
+    int index = 0;
+    for (int i = 0; i < (int)get_num_osds(); i++) {
+      if (i && i % osd_per_host == 0) {
+        ++index;
+      }
+      stringstream osd_name;
+      stringstream host_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      host_name << "host-" << index;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name.str();
+      move_to.push_back(host_loc);
+      auto r = crush_move(tmp, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+
+    // build crush rule
+    CrushWrapper crush;
+    get_crush(tmp, crush);
+    string rule_name = "rule_37968";
+    int rule_type = pg_pool_t::TYPE_ERASURE;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
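+    // find the first unused rule id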
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    string root_name = "default";
+    int root = crush.get_item_id(root_name);
+    int min_size = 3;
+    int max_size = 4;
+    int steps = 6;
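+    // rule: take default, indep-choose 2 hosts, then 2 osds under each (4 in total)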
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, root, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 1 /* host */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSE_INDEP, 2, 0 /* osd */);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      tmp.apply_incremental(pending_inc);
+    }
+
+    // create an erasure-coded pool referencing the above rule
+    int64_t pool_37968;
+    {
+      OSDMap::Incremental new_pool_inc(tmp.get_epoch() + 1);
+      new_pool_inc.new_pool_max = tmp.get_pool_max();
+      new_pool_inc.fsid = tmp.get_fsid();
+      pg_pool_t empty;
+      pool_37968 = ++new_pool_inc.new_pool_max;
+      pg_pool_t *p = new_pool_inc.get_new_pool(pool_37968, &empty);
+      p->size = 4;
+      p->set_pg_num(8);
+      p->set_pgp_num(8);
+      p->type = pg_pool_t::TYPE_ERASURE;
+      p->crush_rule = rno;
+      p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+      new_pool_inc.new_pool_names[pool_37968] = "pool_37968";
+      tmp.apply_incremental(new_pool_inc);
+    }
+
+    pg_t ec_pg(0, pool_37968);
+    pg_t ec_pgid = tmp.raw_pg_to_pg(ec_pg);
+    int from = -1;
+    int to = -1;
+    {
+      // insert a valid pg_upmap_item
+      vector<int> ec_up;
+      int ec_up_primary;
+      tmp.pg_to_raw_up(ec_pgid, &ec_up, &ec_up_primary);
+      ASSERT_TRUE(ec_up.size() == 4);
+      from = *(ec_up.begin());
+      ASSERT_TRUE(from >= 0);
+      auto parent = tmp.crush->get_parent_of_type(from, 1 /* host */, rno);
+      ASSERT_TRUE(parent < 0);
+      // pick an osd under the same parent as *from*
+      for (int i = 0; i < (int)get_num_osds(); i++) {
+        if (std::find(ec_up.begin(), ec_up.end(), i) == ec_up.end()) {
+          auto p = tmp.crush->get_parent_of_type(i, 1 /* host */, rno);
+          if (p == parent) {
+            to = i;
+            break;
+          }
+        }
+      }
+      ASSERT_TRUE(to >= 0);
+      ASSERT_TRUE(from != to);
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(from, to));
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[ec_pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
+    }
+    {
+      // *clean_pg_upmaps* should not remove the above upmap_item
+      OSDMap::Incremental pending_inc(tmp.get_epoch() + 1);
+      clean_pg_upmaps(g_ceph_context, tmp, pending_inc);
+      tmp.apply_incremental(pending_inc);
+      ASSERT_TRUE(tmp.have_pg_upmaps(ec_pgid));
+    }
+  }
+
   {
     // TEST pg_upmap
     {
@@ -794,7 +971,7 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
     {
       // STEP-2: apply cure
       OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
-      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
       osdmap.apply_incremental(pending_inc);
       {
         // validate pg_upmap is gone (reverted)
@@ -928,7 +1105,7 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
     {
       // STEP-4: apply cure
       OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
-      osdmap.maybe_remove_pg_upmaps(g_ceph_context, osdmap, &pending_inc);
+      clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
       osdmap.apply_incremental(pending_inc);
       {
         // validate pg_upmap_items is gone (reverted)
@@ -941,6 +1118,285 @@ TEST_F(OSDMapTest, CleanPGUpmaps) {
   }
 }
 
+TEST_F(OSDMapTest, BUG_38897) {
+  // http://tracker.ceph.com/issues/38897
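+  // calc_pg_upmaps must cope with this deliberately skewed map without misbehaving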
+  // build a fresh map with 12 OSDs, without any default pools
+  set_up_map(12, true);
+  const string pool_1("pool1");
+  const string pool_2("pool2");
+  int64_t pool_1_id = -1;
+
+  {
+    // build customized crush rule for "pool1"
+    string host_name = "host_for_pool_1";
+    // build a customized host to capture osd.1~4
+    for (int i = 1; i < 5; i++) {
+      stringstream osd_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name;
+      move_to.push_back(host_loc);
+      auto r = crush_move(osdmap, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+    CrushWrapper crush;
+    get_crush(osdmap, crush);
+    auto host_id = crush.get_item_id(host_name);
+    ASSERT_TRUE(host_id < 0);
+    string rule_name = "rule_for_pool1";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 3;
+    int max_size = 3;
+    int steps = 7;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    // always choose osd.0
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    // then pick two more osds from the customized host
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+
+    // create "pool1"
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    auto pool_id = ++pending_inc.new_pool_max;
+    pool_1_id = pool_id;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    p->min_size = 1;
+    p->set_pg_num(3);
+    p->set_pgp_num(3);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = rno;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = pool_1;
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_1);
+    {
+      for (unsigned i = 0; i < 3; i++) {
+        // 1.x -> [1]
+        pg_t rawpg(i, pool_id);
+        pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+        vector<int> up;
+        int up_primary;
+        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+        ASSERT_TRUE(up.size() == 3);
+        ASSERT_TRUE(up[0] == 0);
+
+        // insert a new pg_upmap
+        vector<int32_t> new_up;
+        // and remap 1.x to osd.1 only
+        // this way osd.0 is deemed to be *underfull*
+        // and osd.1 is deemed to be *overfull*
+        new_up.push_back(1);
+        {
+          OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+          pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
+            new_up.begin(), new_up.end());
+          osdmap.apply_incremental(pending_inc);
+        }
+        osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+        ASSERT_TRUE(up.size() == 1);
+        ASSERT_TRUE(up[0] == 1);
+      }
+    }
+  }
+
+  {
+    // build customized crush rule for "pool2"
+    string host_name = "host_for_pool_2";
+    // build a customized host to capture osd.6~11
+    for (int i = 6; i < (int)get_num_osds(); i++) {
+      stringstream osd_name;
+      vector<string> move_to;
+      osd_name << "osd." << i;
+      move_to.push_back("root=default");
+      string host_loc = "host=" + host_name;
+      move_to.push_back(host_loc);
+      auto r = crush_move(osdmap, osd_name.str(), move_to);
+      ASSERT_EQ(0, r);
+    }
+    CrushWrapper crush;
+    get_crush(osdmap, crush);
+    auto host_id = crush.get_item_id(host_name);
+    ASSERT_TRUE(host_id < 0);
+    string rule_name = "rule_for_pool2";
+    int rule_type = pg_pool_t::TYPE_REPLICATED;
+    ASSERT_TRUE(!crush.rule_exists(rule_name));
+    int rno;
+    for (rno = 0; rno < crush.get_max_rules(); rno++) {
+      if (!crush.rule_exists(rno) && !crush.ruleset_exists(rno))
+        break;
+    }
+    int min_size = 3;
+    int max_size = 3;
+    int steps = 7;
+    crush_rule *rule = crush_make_rule(steps, rno, rule_type, min_size, max_size);
+    int step = 0;
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSELEAF_TRIES, 5, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_SET_CHOOSE_TRIES, 100, 0);
+    // always choose osd.0
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, 0, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    // then pick two more osds from the customized host
+    crush_rule_set_step(rule, step++, CRUSH_RULE_TAKE, host_id, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_CHOOSELEAF_FIRSTN, 2, 0);
+    crush_rule_set_step(rule, step++, CRUSH_RULE_EMIT, 0, 0);
+    ASSERT_TRUE(step == steps);
+    auto r = crush_add_rule(crush.get_crush_map(), rule, rno);
+    ASSERT_TRUE(r >= 0);
+    crush.set_rule_name(rno, rule_name);
+    {
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.crush.clear();
+      crush.encode(pending_inc.crush, CEPH_FEATURES_SUPPORTED_DEFAULT);
+      osdmap.apply_incremental(pending_inc);
+    }
+
+    // create "pool2"
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    auto pool_id = ++pending_inc.new_pool_max;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    // include a single PG
+    p->set_pg_num(1);
+    p->set_pgp_num(1);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = rno;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = pool_2;
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == pool_2);
+    pg_t rawpg(0, pool_id);
+    pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+    EXPECT_TRUE(!osdmap.have_pg_upmaps(pgid));
+    vector<int> up;
+    int up_primary;
+    osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+    ASSERT_TRUE(up.size() == 3);
+    ASSERT_TRUE(up[0] == 0);
+
+    {
+      // build a pg_upmap_item that will
+      // remap pg out from *underfull* osd.0
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      new_pg_upmap_items.push_back(make_pair(0, 10)); // osd.0 -> osd.10
+      OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+      pending_inc.new_pg_upmap_items[pgid] =
+      mempool::osdmap::vector<pair<int32_t,int32_t>>(
+        new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+      osdmap.apply_incremental(pending_inc);
+      ASSERT_TRUE(osdmap.have_pg_upmaps(pgid));
+      vector<int> up;
+      int up_primary;
+      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+      ASSERT_TRUE(up.size() == 3);
+      ASSERT_TRUE(up[0] == 10);
+    }
+  }
+
+  // ready to go
+  {
+    // require perfect distribution!
+    auto ret = g_ceph_context->_conf.set_val(
+      "osd_calc_pg_upmaps_max_stddev", "0");
+    ASSERT_EQ(0, ret);
+    g_ceph_context->_conf.apply_changes(nullptr);
+    set<int64_t> only_pools;
+    ASSERT_TRUE(pool_1_id >= 0);
+    only_pools.insert(pool_1_id);
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    osdmap.calc_pg_upmaps(g_ceph_context,
+                          0, // so we can force optimizing
+                          100,
+                          only_pools,
+                          &pending_inc);
+    osdmap.apply_incremental(pending_inc);
+  }
+}
+
+TEST_F(OSDMapTest, BUG_40104) {
+  // http://tracker.ceph.com/issues/40104
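+  // clean_pg_upmaps used to choke on large maps; build 5000 osds / 10000 pgs and time the parallel path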
+  int big_osd_num = 5000;
+  int big_pg_num = 10000;
+  set_up_map(big_osd_num, true);
+  int pool_id;
+  {
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    pending_inc.new_pool_max = osdmap.get_pool_max();
+    pool_id = ++pending_inc.new_pool_max;
+    pg_pool_t empty;
+    auto p = pending_inc.get_new_pool(pool_id, &empty);
+    p->size = 3;
+    p->min_size = 1;
+    p->set_pg_num(big_pg_num);
+    p->set_pgp_num(big_pg_num);
+    p->type = pg_pool_t::TYPE_REPLICATED;
+    p->crush_rule = 0;
+    p->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
+    pending_inc.new_pool_names[pool_id] = "big_pool";
+    osdmap.apply_incremental(pending_inc);
+    ASSERT_TRUE(osdmap.have_pg_pool(pool_id));
+    ASSERT_TRUE(osdmap.get_pool_name(pool_id) == "big_pool");
+  }
+  {
+    // generate pg_upmap_items for each pg
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    for (int i = 0; i < big_pg_num; i++) {
+      pg_t rawpg(i, pool_id);
+      pg_t pgid = osdmap.raw_pg_to_pg(rawpg);
+      vector<int> up;
+      int up_primary;
+      osdmap.pg_to_raw_up(pgid, &up, &up_primary);
+      ASSERT_TRUE(up.size() == 3);
+      int victim = up[0];
+      int replaced_by = random() % big_osd_num;
+      vector<pair<int32_t,int32_t>> new_pg_upmap_items;
+      // note that it might or might not be valid, we don't care
+      new_pg_upmap_items.push_back(make_pair(victim, replaced_by));
+      pending_inc.new_pg_upmap_items[pgid] =
+        mempool::osdmap::vector<pair<int32_t,int32_t>>(
+          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
+    }
+    osdmap.apply_incremental(pending_inc);
+  }
+  {
+    OSDMap::Incremental pending_inc(osdmap.get_epoch() + 1);
+    auto start = mono_clock::now();
+    clean_pg_upmaps(g_ceph_context, osdmap, pending_inc);
+    auto latency = mono_clock::now() - start;
+    std::cout << "clean_pg_upmaps (~" << big_pg_num
+              << " pg_upmap_items) latency:" << timespan_str(latency)
+              << std::endl;
+  }
+}
+
 TEST(PGTempMap, basic)
 {
   PGTempMap m;