]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/osd/OSDMap.cc
update sources to 12.2.7
[ceph.git] / ceph / src / osd / OSDMap.cc
index f2c4e34023b7c851588d746aaedfe39ffe439793..dc9e2e5083ec0663bb40ac1b87e5f8b151a43efa 100644 (file)
@@ -236,6 +236,8 @@ int OSDMap::Incremental::propagate_snaps_to_tiers(CephContext *cct,
   return 0;
 }
 
+// ----------------------------------
+// OSDMap
 
 bool OSDMap::subtree_is_down(int id, set<int> *down_cache) const
 {
@@ -1608,125 +1610,123 @@ void OSDMap::maybe_remove_pg_upmaps(CephContext *cct,
   OSDMap tmpmap;
   tmpmap.deepish_copy_from(osdmap);
   tmpmap.apply_incremental(*pending_inc);
+  set<pg_t> to_check;
+  set<pg_t> to_cancel;
+  map<int, map<int, float>> rule_weight_map;
 
   for (auto& p : tmpmap.pg_upmap) {
-    ldout(cct, 10) << __func__ << " pg_upmap entry "
-                   << "[" << p.first << ":" << p.second << "]"
-                   << dendl;
-    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
+    to_check.insert(p.first);
+  }
+  for (auto& p : tmpmap.pg_upmap_items) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : pending_inc->new_pg_upmap) {
+    to_check.insert(p.first);
+  }
+  for (auto& p : pending_inc->new_pg_upmap_items) {
+    to_check.insert(p.first);
+  }
+  for (auto& pg : to_check) {
+    auto crush_rule = tmpmap.get_pg_pool_crush_rule(pg);
     if (crush_rule < 0) {
       lderr(cct) << __func__ << " unable to load crush-rule of pg "
-                 << p.first << dendl;
+                 << pg << dendl;
       continue;
     }
+    map<int, float> weight_map;
+    auto it = rule_weight_map.find(crush_rule);
+    if (it == rule_weight_map.end()) {
+      auto r = tmpmap.crush->get_rule_weight_osd_map(crush_rule, &weight_map);
+      if (r < 0) {
+        lderr(cct) << __func__ << " unable to get crush weight_map for "
+                   << "crush_rule " << crush_rule << dendl;
+        continue;
+      }
+      rule_weight_map[crush_rule] = weight_map;
+    } else {
+      weight_map = it->second;
+    }
     auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
     if (type < 0) {
       lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
-                 << p.first << dendl;
-      continue;
-    } else if (type == 0) {
-      ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
-                     << " is osd-level, skipping"
-                     << dendl;
+                 << pg << dendl;
       continue;
     }
-    ldout(cct, 10) << __func__ << " pg " << p.first
+    ldout(cct, 10) << __func__ << " pg " << pg
                    << " crush-rule-id " << crush_rule
+                   << " weight_map " << weight_map
                    << " failure-domain-type " << type
                    << dendl;
     vector<int> raw;
     int primary;
-    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
+    tmpmap.pg_to_raw_up(pg, &raw, &primary);
     set<int> parents;
-    bool error = false;
-    bool collide = false;
     for (auto osd : raw) {
-      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
-      if (parent >= 0) {
-        lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
-                   << ", pg " << p.first
-                   << dendl;
-        error = true;
+      if (type > 0) {
+        auto parent = tmpmap.crush->get_parent_of_type(osd, type, crush_rule);
+        if (parent >= 0) {
+          lderr(cct) << __func__ << " unable to get parent of raw osd."
+                     << osd << " of pg " << pg
+                     << dendl;
+          break;
+        }
+        auto r = parents.insert(parent);
+        if (!r.second) {
+          // two up-set osds come from same parent
+          to_cancel.insert(pg);
+          break;
+        }
+      }
+      // the above check validates collision only
+      // below we continue to check against crush-topology changing..
+      auto it = weight_map.find(osd);
+      if (it == weight_map.end()) {
+        // osd is gone or has been moved out of the specific crush-tree
+        to_cancel.insert(pg);
         break;
       }
-      auto r = parents.insert(parent);
-      if (!r.second) {
-        collide = true;
+      auto adjusted_weight = tmpmap.get_weightf(it->first) * it->second;
+      if (adjusted_weight == 0) {
+        // osd is out/crush-out
+        to_cancel.insert(pg);
         break;
       }
     }
-    if (!error && collide) {
-      ldout(cct, 10) << __func__ << " removing invalid pg_upmap "
-                     << "[" << p.first << ":" << p.second << "]"
-                     << ", final mapping result will be: " << raw
-                     << dendl;
-      auto it = pending_inc->new_pg_upmap.find(p.first);
+  }
+  for (auto &pg: to_cancel) {
+    { // pg_upmap
+      auto it = pending_inc->new_pg_upmap.find(pg);
       if (it != pending_inc->new_pg_upmap.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pending "
+                       << "pg_upmap entry "
+                       << it->first << "->" << it->second
+                       << dendl;
         pending_inc->new_pg_upmap.erase(it);
       }
-      if (osdmap.pg_upmap.count(p.first)) {
-        pending_inc->old_pg_upmap.insert(p.first);
-      }
-    }
-  }
-  for (auto& p : tmpmap.pg_upmap_items) {
-    ldout(cct, 10) << __func__ << " pg_upmap_items entry "
-                   << "[" << p.first << ":" << p.second << "]"
-                   << dendl;
-    auto crush_rule = tmpmap.get_pg_pool_crush_rule(p.first);
-    if (crush_rule < 0) {
-      lderr(cct) << __func__ << " unable to load crush-rule of pg "
-                 << p.first << dendl;
-      continue;
-    }
-    auto type = tmpmap.crush->get_rule_failure_domain(crush_rule);
-    if (type < 0) {
-      lderr(cct) << __func__ << " unable to load failure-domain-type of pg "
-                 << p.first << dendl;
-      continue;
-    } else if (type == 0) {
-      ldout(cct, 10) << __func__ << " failure-domain of pg " << p.first
-                     << " is osd-level, skipping"
-                     << dendl;
-      continue;
-    }
-    ldout(cct, 10) << __func__ << " pg " << p.first
-                   << " crush_rule_id " << crush_rule
-                   << " failure_domain_type " << type
-                   << dendl;
-    vector<int> raw;
-    int primary;
-    tmpmap.pg_to_raw_up(p.first, &raw, &primary);
-    set<int> parents;
-    bool error = false;
-    bool collide = false;
-    for (auto osd : raw) {
-      auto parent = tmpmap.crush->get_parent_of_type(osd, type);
-      if (parent >= 0) {
-        lderr(cct) << __func__ << " unable to get parent of raw osd." << osd
-                   << ", pg " << p.first
-                   << dendl;
-        error = true;
-        break;
-      }
-      auto r = parents.insert(parent);
-      if (!r.second) {
-        collide = true;
-        break;
+      if (osdmap.pg_upmap.count(pg)) {
+        ldout(cct, 10) << __func__ << " cancel invalid pg_upmap entry "
+                       << osdmap.pg_upmap.find(pg)->first << "->"
+                       << osdmap.pg_upmap.find(pg)->second
+                       << dendl;
+        pending_inc->old_pg_upmap.insert(pg);
       }
     }
-    if (!error && collide) {
-      ldout(cct, 10) << __func__ << " removing invalid pg_upmap_items "
-                     << "[" << p.first << ":" << p.second << "]"
-                     << ", final mapping result will be: " << raw
-                     << dendl;
-      // This is overkilling, but simpler..
-      auto it = pending_inc->new_pg_upmap_items.find(p.first);
+    { // pg_upmap_items
+      auto it = pending_inc->new_pg_upmap_items.find(pg);
       if (it != pending_inc->new_pg_upmap_items.end()) {
+        ldout(cct, 10) << __func__ << " cancel invalid pending "
+                       << "pg_upmap_items entry "
+                       << it->first << "->" << it->second
+                       << dendl;
         pending_inc->new_pg_upmap_items.erase(it);
       }
-      if (osdmap.pg_upmap_items.count(p.first)) {
-        pending_inc->old_pg_upmap_items.insert(p.first);
+      if (osdmap.pg_upmap_items.count(pg)) {
+        ldout(cct, 10) << __func__ << " cancel invalid "
+                       << "pg_upmap_items entry "
+                       << osdmap.pg_upmap_items.find(pg)->first << "->"
+                       << osdmap.pg_upmap_items.find(pg)->second
+                       << dendl;
+        pending_inc->old_pg_upmap_items.insert(pg);
       }
     }
   }
@@ -2349,6 +2349,24 @@ bool OSDMap::primary_changed(
   return false;      // same primary (tho replicas may have changed)
 }
 
+uint64_t OSDMap::get_encoding_features() const
+{
+  uint64_t f = SIGNIFICANT_FEATURES;
+  if (require_osd_release < CEPH_RELEASE_LUMINOUS) {
+    f &= ~(CEPH_FEATURE_SERVER_LUMINOUS |
+          CEPH_FEATURE_CRUSH_CHOOSE_ARGS);
+  }
+  if (require_osd_release < CEPH_RELEASE_KRAKEN) {
+    f &= ~(CEPH_FEATURE_SERVER_KRAKEN |
+          CEPH_FEATURE_MSG_ADDR2 |
+          CEPH_FEATURE_CRUSH_TUNABLES5);
+  }
+  if (require_osd_release < CEPH_RELEASE_JEWEL) {
+    f &= ~(CEPH_FEATURE_SERVER_JEWEL |
+          CEPH_FEATURE_NEW_OSDOP_ENCODING);
+  }
+  return f;
+}
 
 // serialize, unserialize
 void OSDMap::encode_client_old(bufferlist& bl) const
@@ -2487,6 +2505,8 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
   ENCODE_START(8, 7, bl);
 
   {
+    // NOTE: any new encoding dependencies must be reflected by
+    // SIGNIFICANT_FEATURES
     uint8_t v = 6;
     if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
       v = 3;
@@ -2557,6 +2577,8 @@ void OSDMap::encode(bufferlist& bl, uint64_t features) const
   }
 
   {
+    // NOTE: any new encoding dependencies must be reflected by
+    // SIGNIFICANT_FEATURES
     uint8_t target_v = 5;
     if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
       target_v = 1;
@@ -4074,6 +4096,8 @@ int OSDMap::calc_pg_upmaps(
     for (auto p = deviation_osd.rbegin(); p != deviation_osd.rend(); ++p) {
       int osd = p->second;
       float deviation = p->first;
+      // make sure osd is still there (belongs to this crush-tree)
+      assert(osd_weight.count(osd));
       float target = osd_weight[osd] * pgs_per_weight;
       assert(target > 0);
       if (deviation/target < max_deviation_ratio) {