]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/ConfigKeyService.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 #define dout_subsys ceph_subsys_mon
95 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
96 static const string OSD_METADATA_PREFIX("osd_metadata");
97 static const string OSD_SNAP_PREFIX("osd_snap");
98
99 /*
100
101 OSD snapshot metadata
102 ---------------------
103
104 -- starting with mimic, removed in octopus --
105
106 "removed_epoch_%llu_%08lx" % (pool, epoch)
107 -> interval_set<snapid_t>
108
109 "removed_snap_%llu_%016llx" % (pool, last_snap)
110 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
111
112
113 -- starting with mimic --
114
115 "purged_snap_%llu_%016llx" % (pool, last_snap)
116 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
117
- note that the {removed,purged}_snap put the last snap in the key so
119 that we can use forward iteration only to search for an epoch in an
120 interval. e.g., to test if epoch N is removed/purged, we'll find a key
121 >= N that either does or doesn't contain the given snap.
122
123
124 -- starting with octopus --
125
126 "purged_epoch_%08lx" % epoch
127 -> map<int64_t,interval_set<snapid_t>>
128
129 */
130 using namespace TOPNSPC::common;
131 namespace {
132
// Base adapter that exposes one of the OSDMonitor's osdmap LRU caches
// (incremental or full) to the PriorityCache manager, so the cache's
// memory budget can be tuned centrally alongside rocksdb.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;   // monitor owning the underlying LRU (non-owning ptr)
  // bytes assigned to this cache, indexed by PriorityCache priority level
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // size granted by the last commit_cache_size()
  double cache_ratio = 0;       // this cache's share of the tunable memory pool

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // bytes actually used by the underlying LRU; supplied by subclasses
  virtual uint64_t _get_used_bytes() const = 0;

  // How many *additional* bytes this cache wants at priority 'pri'
  // beyond what has already been assigned.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    // no other priority levels are supported
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // total bytes assigned across all priority levels
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current assignment up to a chunk of the total cache and
  // remember it as the committed size.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  virtual string get_cache_name() const = 0;
};
196
// PriorityCache adapter for the incremental-osdmap LRU (inc_osd_cache).
struct IncCache : public OSDMemCache {
  IncCache(OSDMonitor *m) : OSDMemCache(m) {};

  // bytes currently held by the incremental-map LRU
  virtual uint64_t _get_used_bytes() const {
    return osdmon->inc_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Inc Cache";
  }

  // number of incremental maps currently cached
  uint64_t _get_num_osdmaps() const {
    return osdmon->inc_osd_cache.get_size();
  }
};
212
// PriorityCache adapter for the full-osdmap LRU (full_osd_cache).
struct FullCache : public OSDMemCache {
  FullCache(OSDMonitor *m) : OSDMemCache(m) {};

  // bytes currently held by the full-map LRU
  virtual uint64_t _get_used_bytes() const {
    return osdmon->full_osd_cache.get_bytes();
  }

  virtual string get_cache_name() const {
    return "OSDMap Full Cache";
  }

  // number of full maps currently cached
  uint64_t _get_num_osdmaps() const {
    return osdmon->full_osd_cache.get_size();
  }
};
228
// File-scope handles to the two cache adapters; created in the OSDMonitor
// constructor and registered with the PriorityCache manager (pcm).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (see pool application commands).
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
235
236 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
237 // Note: this doesn't include support for the application tag match
238 if ((grant.spec.allow & OSD_CAP_W) != 0) {
239 auto& match = grant.match;
240 if (match.is_match_all()) {
241 return true;
242 } else if (pool_name != nullptr &&
243 !match.pool_namespace.pool_name.empty() &&
244 match.pool_namespace.pool_name == *pool_name) {
245 return true;
246 }
247 }
248 return false;
249 }
250
// Decide whether an entity may perform unmanaged-snapshot pool ops.
// Permitted when either (a) its mon caps allow the "osd pool op
// unmanaged-snap" command (scoped to the pool when pool_name is given),
// or (b) its OSD caps grant write access to the pool or to everything.
// pool_name may be null when the pool does not exist; then only an
// unrestricted cap suffices.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // 1) check the mon capability first
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // 2) fall back to the OSD caps recorded in the auth database
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  // check every grant; profile grants expand to their profile_grants
  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
318
319 } // anonymous namespace
320
// Record the last-epoch-clean reported for one PG of this pool (keyed by
// its ps), maintaining two aggregates: 'floor' (minimum lec over reported
// PGs) and 'next_missing' (first ps that has not reported yet; slot
// value 0 means "not yet reported").
void LastEpochClean::Lec::report(ps_t ps, epoch_t last_epoch_clean)
{
  if (epoch_by_pg.size() <= ps) {
    // grow on demand; new slots default to 0 == unreported
    epoch_by_pg.resize(ps + 1, 0);
  }
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down; rescan
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported PGs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
351
// Drop all last-epoch-clean tracking for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
356
// Route a PG's last-epoch-clean report to its pool's Lec record,
// creating the record on first report.
void LastEpochClean::report(const pg_t& pg, epoch_t last_epoch_clean)
{
  auto& lec = report_by_pool[pg.pool()];
  return lec.report(pg.ps(), last_epoch_clean);
}
362
363 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
364 {
365 auto floor = latest.get_epoch();
366 for (auto& pool : latest.get_pools()) {
367 auto reported = report_by_pool.find(pool.first);
368 if (reported == report_by_pool.end()) {
369 return 0;
370 }
371 if (reported->second.next_missing < pool.second.get_pg_num()) {
372 return 0;
373 }
374 if (reported->second.floor < floor) {
375 floor = reported->second.floor;
376 }
377 }
378 return floor;
379 }
380
381
382 class C_UpdateCreatingPGs : public Context {
383 public:
384 OSDMonitor *osdmon;
385 utime_t start;
386 epoch_t epoch;
387 C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
388 osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
389 void finish(int r) override {
390 if (r >= 0) {
391 utime_t end = ceph_clock_now();
392 dout(10) << "osdmap epoch " << epoch << " mapping took "
393 << (end - start) << " seconds" << dendl;
394 osdmon->update_creating_pgs();
395 osdmon->check_pg_creates_subs();
396 }
397 }
398 };
399
// Debug-log prefix for this file: "mon.<name>@<rank>(<state>).osd e<epoch> ".
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
static ostream& _prefix(std::ostream *_dout, Monitor *mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon->name << "@" << mon->rank
                << "(" << mon->get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
407
// Construct the OSD monitor service: size the inc/full osdmap LRU caches,
// create their PriorityCache adapters, and register as a config observer
// so cache tuning reacts to runtime option changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor *mn,
  Paxos *p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn->cct, &mn->cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  // _set_cache_sizes() only fails on invalid memory-target options; we
  // then fall back to a fixed-size cache with no priority management.
  int r = _set_cache_sizes();
  if (r < 0) {
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
431
432 const char **OSDMonitor::get_tracked_conf_keys() const
433 {
434 static const char* KEYS[] = {
435 "mon_memory_target",
436 "mon_memory_autotune",
437 "rocksdb_cache_size",
438 NULL
439 };
440 return KEYS;
441 }
442
// React to runtime changes of the tracked cache options: toggle
// autotuning, and re-derive the pcm/rocksdb cache sizes when the memory
// target or kv cache size changes.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    // failure leaves the previous sizes in effect (see
    // _update_mon_cache_settings), so only log here
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
464
// Enable or disable priority-cache autotuning to match the
// mon_memory_autotune option.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    // NOTE(review): mon_memory_autotune (the member flag) is not reset to
    // false on this path — confirm that is intentional.
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
486
// Re-derive the priority-cache sizing (target/min/max and kv ratio) from
// the current mon_memory_target and rocksdb_cache_size options, then
// retune and rebalance the pcm.  On invalid input or a failed ratio
// computation the previous sizes are restored and -EINVAL is returned.
int OSDMonitor::_update_mon_cache_settings()
{
  // validate the new option values before touching any state
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember the old sizes so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // derive max from the fragmentation-adjusted target, less the base
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    // roll back to the previous, known-good sizes
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
546
// Initialize cache sizing state from configuration.  When autotuning is
// enabled, record the memory base/fragmentation/target/min options and
// seed both osdmap LRU caches at the minimum size; returns -EINVAL on
// invalid target/min options.  When autotuning is off this is a no-op.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
571
// True when the pending incremental already stages a new CRUSH map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
576
// The committed (stable) CRUSH map of the current osdmap; ignores any
// pending changes.
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
581
582 void OSDMonitor::_get_pending_crush(CrushWrapper& newcrush)
583 {
584 bufferlist bl;
585 if (pending_inc.crush.length())
586 bl = pending_inc.crush;
587 else
588 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
589
590 auto p = bl.cbegin();
591 newcrush.decode(p);
592 }
593
// Build the very first osdmap (epoch 1) for a new cluster and stage it
// as the pending full map: start from an mkfs-provided map if one was
// stored, otherwise a simple default map; set default flags, full
// ratios, and the required osd/client release levels.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon->monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon->store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // an osdmap was provided at mkfs time; adopt it (with our fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon->monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon->monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_octopus")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_nautilus")) {
      derr << __func__ << " mon_debug_no_require_octopus and nautilus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::mimic;
    } else {
      derr << __func__ << " mon_debug_no_require_octopus=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::octopus;
    ceph_release_t r = ceph_release_from_name(
      g_conf()->mon_osd_initial_require_min_compat_client);
    if (!r) {
      ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
    }
    newmap.require_min_compat_client = r;
  }

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
652
653 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
654 {
655 s.insert(service_name);
656 s.insert(OSD_PG_CREATING_PREFIX);
657 s.insert(OSD_METADATA_PREFIX);
658 s.insert(OSD_SNAP_PREFIX);
659 }
660
// Bring the in-memory osdmap up to date with the paxos-committed state:
// load the newest stashed full map, apply (and stash) every missing
// incremental, then refresh everything derived from the map — creating
// PGs, subscriptions, msgr feature bits, and the down->out tracking.
// 'need_bootstrap' is not used by this service.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // any in-flight mapping job was computed against a stale epoch; cancel it
  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for a stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon->store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon->store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  // fast-forward the in-memory map by decoding the newest stashed full map
  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  // reload the persisted creating-PGs state, if present
  bufferlist bl;
  if (!mon->store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon->get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    // epoch 1 means we just consumed the mkfs-provided map; drop it
    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction before it grows too large
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon->store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto &osd_state : inc.new_state) {
      if (osd_state.second & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd_state.first);
      }
      if (osd_state.second & CEPH_OSD_EXISTS) {
        // could be created *or* destroyed, but we can safely drop it
        osd_epochs.erase(osd_state.first);
      }
    }
  }

  // flush whatever remains of the last partial transaction
  if (t) {
    mon->store->apply_transaction(t);
  }

  // reconcile the down->out grace tracking with the new map
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon->is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
}
898
// Create the PriorityCache manager and register the rocksdb kv cache and
// the two osdmap caches with it.  Returns -EINVAL if the configured sizes
// are invalid, rocksdb exposes no priority cache, or the ratios cannot
// be computed.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon->store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  // create the manager and register all three caches with it
  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
948
// Split the memory target between the rocksdb kv cache and the two
// osdmap caches: kv gets rocksdb_cache_size/mon_memory_target, and the
// remainder is divided evenly between the inc and full caches.  Fails
// (restoring the previous kv ratio) if kv would consume the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  // split what's left evenly between the inc and full osdmap caches
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
          << " inc ratio " << cache_inc_ratio
          << " full ratio " << cache_full_ratio
          << dendl;
  return 0;
}
973
974 void OSDMonitor::start_mapping()
975 {
976 // initiate mapping job
977 if (mapping_job) {
978 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
979 << dendl;
980 mapping_job->abort();
981 }
982 if (!osdmap.get_pools().empty()) {
983 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
984 mapping_job = mapping.start_update(osdmap, mapper,
985 g_conf()->mon_osd_mapping_pgs_per_chunk);
986 dout(10) << __func__ << " started mapping job " << mapping_job.get()
987 << " at " << fin->start << dendl;
988 mapping_job->set_finish_event(fin);
989 } else {
990 dout(10) << __func__ << " no pools, no mapping job" << dendl;
991 mapping_job = nullptr;
992 }
993 }
994
995 void OSDMonitor::update_msgr_features()
996 {
997 set<int> types;
998 types.insert((int)entity_name_t::TYPE_OSD);
999 types.insert((int)entity_name_t::TYPE_CLIENT);
1000 types.insert((int)entity_name_t::TYPE_MDS);
1001 types.insert((int)entity_name_t::TYPE_MON);
1002 for (set<int>::iterator q = types.begin(); q != types.end(); ++q) {
1003 uint64_t mask;
1004 uint64_t features = osdmap.get_features(*q, &mask);
1005 if ((mon->messenger->get_policy(*q).features_required & mask) != features) {
1006 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1007 ceph::net::Policy p = mon->messenger->get_policy(*q);
1008 p.features_required = (p.features_required & ~mask) | features;
1009 mon->messenger->set_policy(*q, p);
1010 }
1011 }
1012 }
1013
// Called when this monitor's paxos service becomes active.  The leader
// logs the map and runs the one-time pool priority conversion; peons
// re-dispatch any failure reports queued while inactive.  Both restart
// the PG mapping job.
void OSDMonitor::on_active()
{
  update_logger();

  if (mon->is_leader()) {
    mon->clog->debug() << "osdmap " << osdmap;
    if (!priority_convert) {
      // Only do this once at start-up
      convert_pool_priorities();
      priority_convert = true;
    }
  } else {
    // replay queued failure reports through the normal dispatch path
    list<MonOpRequestRef> ls;
    take_all_failures(ls);
    while (!ls.empty()) {
      MonOpRequestRef op = ls.front();
      op->mark_osdmon_event(__func__);
      dispatch(op);
      ls.pop_front();
    }
  }
  start_mapping();
}
1037
void OSDMonitor::on_restart()
{
  // Forget when each OSD last reported in; the timestamps are stale after
  // a restart/election and will be repopulated as new reports arrive.
  last_osd_report.clear();
}
1042
1043 void OSDMonitor::on_shutdown()
1044 {
1045 dout(10) << __func__ << dendl;
1046 if (mapping_job) {
1047 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1048 << dendl;
1049 mapping_job->abort();
1050 }
1051
1052 // discard failure info, waiters
1053 list<MonOpRequestRef> ls;
1054 take_all_failures(ls);
1055 ls.clear();
1056 }
1057
1058 void OSDMonitor::update_logger()
1059 {
1060 dout(10) << "update_logger" << dendl;
1061
1062 mon->cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1063 mon->cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1064 mon->cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1065 mon->cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1066 }
1067
// Start a fresh pending incremental for epoch osdmap.epoch+1 and reset all
// per-epoch pending state (metadata updates/removals, pseudo-purged snaps).
void OSDMonitor::create_pending()
{
  pending_inc = OSDMap::Incremental(osdmap.epoch+1);
  pending_inc.fsid = mon->monmap->fsid;
  pending_metadata.clear();
  pending_metadata_rm.clear();
  pending_pseudo_purged_snaps.clear();

  dout(10) << "create_pending e " << pending_inc.epoch << dendl;

  // safety checks (this shouldn't really happen)
  {
    // Seed any unset/invalid full ratios from config.  Config values may be
    // expressed as percentages (> 1.0); convert those to a [0,1] ratio.
    if (osdmap.backfillfull_ratio <= 0) {
      pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
      if (pending_inc.new_backfillfull_ratio > 1.0)
	pending_inc.new_backfillfull_ratio /= 100;
      dout(1) << __func__ << " setting backfillfull_ratio = "
	      << pending_inc.new_backfillfull_ratio << dendl;
    }
    if (osdmap.full_ratio <= 0) {
      pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
      if (pending_inc.new_full_ratio > 1.0)
	pending_inc.new_full_ratio /= 100;
      dout(1) << __func__ << " setting full_ratio = "
	      << pending_inc.new_full_ratio << dendl;
    }
    if (osdmap.nearfull_ratio <= 0) {
      pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
      if (pending_inc.new_nearfull_ratio > 1.0)
	pending_inc.new_nearfull_ratio /= 100;
      dout(1) << __func__ << " setting nearfull_ratio = "
	      << pending_inc.new_nearfull_ratio << dendl;
    }
  }

  // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
  // structure.
  if (osdmap.crush->has_legacy_rule_ids()) {
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // First, for all pools, work out which rule they really used
    // by resolving ruleset to rule.
    for (const auto &i : osdmap.get_pools()) {
      const auto pool_id = i.first;
      const auto &pool = i.second;
      // NOTE(review): find_rule's result is assigned unchecked; presumably
      // legacy maps always resolve to a valid rule -- confirm upstream.
      int new_rule_id = newcrush.find_rule(pool.crush_rule,
					   pool.type, pool.size);

      dout(1) << __func__ << " rewriting pool "
	      << osdmap.get_pool_name(pool_id) << " crush ruleset "
	      << pool.crush_rule << " -> rule id " << new_rule_id << dendl;
      if (pending_inc.new_pools.count(pool_id) == 0) {
	// load the original pool info before modifying it
	pending_inc.new_pools[pool_id] = pool;
      }
      pending_inc.new_pools[pool_id].crush_rule = new_rule_id;
    }

    // Now, go ahead and renumber all the rules so that their
    // rule_id field corresponds to their position in the array
    auto old_to_new = newcrush.renumber_rules();
    dout(1) << __func__ << " Rewrote " << old_to_new << " crush IDs:" << dendl;
    for (const auto &i : old_to_new) {
      dout(1) << __func__ << " " << i.first << " -> " << i.second << dendl;
    }
    // publish the rewritten crush map in the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
}
1137
// Compute the next creating_pgs state for the pending incremental `inc`
// (with `nextmap` being the osdmap after applying it): scan for new/removed
// pools, admit queued pgs up to the creation limit, and advance each
// creating pg's history/past_intervals.  Returns the updated state; the
// caller is responsible for persisting it.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  // work on a private copy so the lock is only held for the copy itself
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools added by this incremental
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // drop creating state for pools deleted by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // admit queued pg ranges, keeping at most mon_osd_max_creating_pgs
  // creations in flight at once
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's [start,end) range as the limit allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon->monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	std::stringstream debug;
	// existing entry: if a new peering interval starts between osdmap
	// and nextmap, roll the history epochs forward accordingly
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1324
// Pre-populate pg_temp entries in pending_inc for pgs whose mapping will
// change when the pending incremental is applied, so clients keep talking
// to the old acting set during the transition.  Either primes everything
// (via a parallel job) or just the pgs touching "interesting" OSDs,
// depending on what changed and a cost estimate.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // a new crush map can remap anything; prime all pgs
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  // newly-up osds can also remap broadly; prime all pgs
  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds going down in this incremental (state bit set while currently up)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  // weight reductions only affect that osd's pgs; an increase can pull pgs
  // from anywhere, so fall back to priming everything
  for (map<int32_t,uint32_t>::iterator p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the per-osd work; if it approaches the full-map cost,
    // just prime everything instead
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // build the map that will result from applying the pending incremental
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg in parallel, bounded by a wall-clock budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // prime only pgs touching the interesting osds, inline, checking the
    // time budget every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  // already handled via another interesting osd
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1427
// Add a pg_temp entry to pending_inc pinning `pgid` to its current acting
// set if its mapping changes in `next`, so clients keep using the old
// (populated) acting set while the new one backfills.  Skips pgs that are
// still being created, that don't exist, or whose old acting set would be
// no better than the new mapping.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current (pre-incremental) mapping
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping after the pending incremental
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry clears any existing mapping
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1475
1476 /**
1477 * @note receiving a transaction in this function gives a fair amount of
1478 * freedom to the service implementation if it does need it. It shouldn't.
1479 */
1480 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1481 {
1482 dout(10) << "encode_pending e " << pending_inc.epoch
1483 << dendl;
1484
1485 if (do_prune(t)) {
1486 dout(1) << __func__ << " osdmap full prune encoded e"
1487 << pending_inc.epoch << dendl;
1488 }
1489
1490 // finalize up pending_inc
1491 pending_inc.modified = ceph_clock_now();
1492
1493 int r = pending_inc.propagate_snaps_to_tiers(cct, osdmap);
1494 ceph_assert(r == 0);
1495
1496 if (mapping_job) {
1497 if (!mapping_job->is_done()) {
1498 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1499 << mapping_job.get() << " did not complete, "
1500 << mapping_job->shards << " left" << dendl;
1501 mapping_job->abort();
1502 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1503 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1504 << mapping_job.get() << " is prior epoch "
1505 << mapping.get_epoch() << dendl;
1506 } else {
1507 if (g_conf()->mon_osd_prime_pg_temp) {
1508 maybe_prime_pg_temp();
1509 }
1510 }
1511 } else if (g_conf()->mon_osd_prime_pg_temp) {
1512 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1513 << dendl;
1514 }
1515 mapping_job.reset();
1516
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1519 auto p = pending_inc.new_state.begin();
1520 while (p != pending_inc.new_state.end()) {
1521 if (p->second == 0) {
1522 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1523 p = pending_inc.new_state.erase(p);
1524 } else {
1525 if (p->second & CEPH_OSD_UP) {
1526 pending_inc.new_last_up_change = pending_inc.modified;
1527 }
1528 ++p;
1529 }
1530 }
1531 if (!pending_inc.new_up_client.empty()) {
1532 pending_inc.new_last_up_change = pending_inc.modified;
1533 }
1534 for (auto& i : pending_inc.new_weight) {
1535 if (i.first >= osdmap.max_osd) {
1536 if (i.second) {
1537 // new osd is already marked in
1538 pending_inc.new_last_in_change = pending_inc.modified;
1539 break;
1540 }
1541 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1542 // existing osd marked in or out
1543 pending_inc.new_last_in_change = pending_inc.modified;
1544 break;
1545 }
1546 }
1547
1548 {
1549 OSDMap tmp;
1550 tmp.deepish_copy_from(osdmap);
1551 tmp.apply_incremental(pending_inc);
1552
1553 // clean pg_temp mappings
1554 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1555
1556 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1557 {
1558 // check every upmapped pg for now
1559 // until we could reliably identify certain cases to ignore,
1560 // which is obviously the hard part TBD..
1561 vector<pg_t> pgs_to_check;
1562 tmp.get_upmap_pgs(&pgs_to_check);
1563 if (pgs_to_check.size() <
1564 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1565 // not enough pgs, do it inline
1566 tmp.clean_pg_upmaps(cct, &pending_inc);
1567 } else {
1568 CleanUpmapJob job(cct, tmp, pending_inc);
1569 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1570 job.wait();
1571 }
1572 }
1573
1574 // update creating pgs first so that we can remove the created pgid and
1575 // process the pool flag removal below in the same osdmap epoch.
1576 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1577 bufferlist creatings_bl;
1578 uint64_t features = CEPH_FEATURES_ALL;
1579 if (mon->monmap->min_mon_release < ceph_release_t::octopus) {
1580 dout(20) << __func__ << " encoding pending pgs without octopus features"
1581 << dendl;
1582 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1583 }
1584 encode(pending_creatings, creatings_bl, features);
1585 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1586
1587 // remove any old (or incompat) POOL_CREATING flags
1588 for (auto& i : tmp.get_pools()) {
1589 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1590 // pre-nautilus OSDMaps shouldn't get this flag.
1591 if (pending_inc.new_pools.count(i.first)) {
1592 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1593 }
1594 }
1595 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1596 !pending_creatings.still_creating_pool(i.first)) {
1597 dout(10) << __func__ << " done creating pool " << i.first
1598 << ", clearing CREATING flag" << dendl;
1599 if (pending_inc.new_pools.count(i.first) == 0) {
1600 pending_inc.new_pools[i.first] = i.second;
1601 }
1602 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1603 }
1604 }
1605
1606 // collect which pools are currently affected by
1607 // the near/backfill/full osd(s),
1608 // and set per-pool near/backfill/full flag instead
1609 set<int64_t> full_pool_ids;
1610 set<int64_t> backfillfull_pool_ids;
1611 set<int64_t> nearfull_pool_ids;
1612 tmp.get_full_pools(cct,
1613 &full_pool_ids,
1614 &backfillfull_pool_ids,
1615 &nearfull_pool_ids);
1616 if (full_pool_ids.empty() ||
1617 backfillfull_pool_ids.empty() ||
1618 nearfull_pool_ids.empty()) {
1619 // normal case - no nearfull, backfillfull or full osds
1620 // try cancel any improper nearfull/backfillfull/full pool
1621 // flags first
1622 for (auto &pool: tmp.get_pools()) {
1623 auto p = pool.first;
1624 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1625 nearfull_pool_ids.empty()) {
1626 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1627 << "'s nearfull flag" << dendl;
1628 if (pending_inc.new_pools.count(p) == 0) {
1629 // load original pool info first!
1630 pending_inc.new_pools[p] = pool.second;
1631 }
1632 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1633 }
1634 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1635 backfillfull_pool_ids.empty()) {
1636 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1637 << "'s backfillfull flag" << dendl;
1638 if (pending_inc.new_pools.count(p) == 0) {
1639 pending_inc.new_pools[p] = pool.second;
1640 }
1641 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1642 }
1643 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1644 full_pool_ids.empty()) {
1645 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1646 // set by EQUOTA, skipping
1647 continue;
1648 }
1649 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1650 << "'s full flag" << dendl;
1651 if (pending_inc.new_pools.count(p) == 0) {
1652 pending_inc.new_pools[p] = pool.second;
1653 }
1654 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1655 }
1656 }
1657 }
1658 if (!full_pool_ids.empty()) {
1659 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1660 << " as full" << dendl;
1661 for (auto &p: full_pool_ids) {
1662 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1663 continue;
1664 }
1665 if (pending_inc.new_pools.count(p) == 0) {
1666 pending_inc.new_pools[p] = tmp.pools[p];
1667 }
1668 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1669 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1670 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1671 }
1672 // cancel FLAG_FULL for pools which are no longer full too
1673 for (auto &pool: tmp.get_pools()) {
1674 auto p = pool.first;
1675 if (full_pool_ids.count(p)) {
1676 // skip pools we have just marked as full above
1677 continue;
1678 }
1679 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1680 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1681 // don't touch if currently is not full
1682 // or is running out of quota (and hence considered as full)
1683 continue;
1684 }
1685 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1686 << "'s full flag" << dendl;
1687 if (pending_inc.new_pools.count(p) == 0) {
1688 pending_inc.new_pools[p] = pool.second;
1689 }
1690 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1691 }
1692 }
1693 if (!backfillfull_pool_ids.empty()) {
1694 for (auto &p: backfillfull_pool_ids) {
1695 if (full_pool_ids.count(p)) {
1696 // skip pools we have already considered as full above
1697 continue;
1698 }
1699 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_BACKFILLFULL flag
1702 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1703 continue;
1704 }
1705 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1706 // don't bother if pool is already marked as backfillfull
1707 continue;
1708 }
1709 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1710 << "'s as backfillfull" << dendl;
1711 if (pending_inc.new_pools.count(p) == 0) {
1712 pending_inc.new_pools[p] = tmp.pools[p];
1713 }
1714 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1715 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1716 }
1717 // cancel FLAG_BACKFILLFULL for pools
1718 // which are no longer backfillfull too
1719 for (auto &pool: tmp.get_pools()) {
1720 auto p = pool.first;
1721 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1722 // skip pools we have just marked as backfillfull/full above
1723 continue;
1724 }
1725 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1726 // and don't touch if currently is not backfillfull
1727 continue;
1728 }
1729 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1730 << "'s backfillfull flag" << dendl;
1731 if (pending_inc.new_pools.count(p) == 0) {
1732 pending_inc.new_pools[p] = pool.second;
1733 }
1734 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1735 }
1736 }
1737 if (!nearfull_pool_ids.empty()) {
1738 for (auto &p: nearfull_pool_ids) {
1739 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1740 continue;
1741 }
1742 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
	// make sure FLAG_FULL is truly set, so we are safe not
	// to set an extra (redundant) FLAG_NEARFULL flag
1745 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1746 continue;
1747 }
1748 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1749 // don't bother if pool is already marked as nearfull
1750 continue;
1751 }
1752 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1753 << "'s as nearfull" << dendl;
1754 if (pending_inc.new_pools.count(p) == 0) {
1755 pending_inc.new_pools[p] = tmp.pools[p];
1756 }
1757 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1758 }
1759 // cancel FLAG_NEARFULL for pools
1760 // which are no longer nearfull too
1761 for (auto &pool: tmp.get_pools()) {
1762 auto p = pool.first;
1763 if (full_pool_ids.count(p) ||
1764 backfillfull_pool_ids.count(p) ||
1765 nearfull_pool_ids.count(p)) {
1766 // skip pools we have just marked as
1767 // nearfull/backfillfull/full above
1768 continue;
1769 }
1770 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1771 // and don't touch if currently is not nearfull
1772 continue;
1773 }
1774 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1775 << "'s nearfull flag" << dendl;
1776 if (pending_inc.new_pools.count(p) == 0) {
1777 pending_inc.new_pools[p] = pool.second;
1778 }
1779 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1780 }
1781 }
1782
1783 // min_compat_client?
1784 if (!tmp.require_min_compat_client) {
1785 auto mv = tmp.get_min_compat_client();
1786 dout(1) << __func__ << " setting require_min_compat_client to currently "
1787 << "required " << mv << dendl;
1788 mon->clog->info() << "setting require_min_compat_client to currently "
1789 << "required " << mv;
1790 pending_inc.new_require_min_compat_client = mv;
1791 }
1792
1793 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1794 tmp.require_osd_release >= ceph_release_t::nautilus) {
1795 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1796 // add creating flags?
1797 for (auto& i : tmp.get_pools()) {
1798 if (pending_creatings.still_creating_pool(i.first)) {
1799 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1800 << dendl;
1801 if (pending_inc.new_pools.count(i.first) == 0) {
1802 pending_inc.new_pools[i.first] = i.second;
1803 }
1804 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1805 }
1806 }
1807 // adjust blacklist items to all be TYPE_ANY
1808 for (auto& i : tmp.blacklist) {
1809 auto a = i.first;
1810 a.set_type(entity_addr_t::TYPE_ANY);
1811 pending_inc.new_blacklist[a] = i.second;
1812 pending_inc.old_blacklist.push_back(i.first);
1813 }
1814 }
1815
1816 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1817 tmp.require_osd_release >= ceph_release_t::octopus) {
1818 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1819
1820 // adjust obsoleted cache modes
1821 for (auto& [poolid, pi] : tmp.pools) {
1822 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1823 if (pending_inc.new_pools.count(poolid) == 0) {
1824 pending_inc.new_pools[poolid] = pi;
1825 }
1826 dout(10) << __func__ << " switching pool " << poolid
1827 << " cachemode from forward -> proxy" << dendl;
1828 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1829 }
1830 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1831 if (pending_inc.new_pools.count(poolid) == 0) {
1832 pending_inc.new_pools[poolid] = pi;
1833 }
1834 dout(10) << __func__ << " switching pool " << poolid
1835 << " cachemode from readforward -> readproxy" << dendl;
1836 pending_inc.new_pools[poolid].cache_mode =
1837 pg_pool_t::CACHEMODE_READPROXY;
1838 }
1839 }
1840
1841 // clear removed_snaps for every pool
1842 for (auto& [poolid, pi] : tmp.pools) {
1843 if (pi.removed_snaps.empty()) {
1844 continue;
1845 }
1846 if (pending_inc.new_pools.count(poolid) == 0) {
1847 pending_inc.new_pools[poolid] = pi;
1848 }
1849 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1850 << dendl;
1851 pending_inc.new_pools[poolid].removed_snaps.clear();
1852 }
1853
1854 // create a combined purged snap epoch key for all purged snaps
1855 // prior to this epoch, and store it in the current epoch (i.e.,
1856 // the last pre-octopus epoch, just prior to the one we're
1857 // encoding now).
1858 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
1859 it->lower_bound("purged_snap_");
1860 map<int64_t,snap_interval_set_t> combined;
1861 while (it->valid()) {
1862 if (it->key().find("purged_snap_") != 0) {
1863 break;
1864 }
1865 string k = it->key();
1866 long long unsigned pool;
1867 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1868 if (n != 1) {
1869 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1870 } else {
1871 bufferlist v = it->value();
1872 auto p = v.cbegin();
1873 snapid_t begin, end;
1874 ceph::decode(begin, p);
1875 ceph::decode(end, p);
1876 combined[pool].insert(begin, end - begin);
1877 }
1878 it->next();
1879 }
1880 if (!combined.empty()) {
1881 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1882 bufferlist v;
1883 ceph::encode(combined, v);
1884 t->put(OSD_SNAP_PREFIX, k, v);
1885 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1886 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1887 << dendl;
1888 } else {
1889 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1890 << dendl;
1891 }
1892
1893 // clean out the old removed_snap_ and removed_epoch keys
1894 // ('`' is ASCII '_' + 1)
1895 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1896 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1897 }
1898 }
1899
1900 // tell me about it
1901 for (auto i = pending_inc.new_state.begin();
1902 i != pending_inc.new_state.end();
1903 ++i) {
1904 int s = i->second ? i->second : CEPH_OSD_UP;
1905 if (s & CEPH_OSD_UP)
1906 dout(2) << " osd." << i->first << " DOWN" << dendl;
1907 if (s & CEPH_OSD_EXISTS)
1908 dout(2) << " osd." << i->first << " DNE" << dendl;
1909 }
1910 for (auto i = pending_inc.new_up_client.begin();
1911 i != pending_inc.new_up_client.end();
1912 ++i) {
1913 //FIXME: insert cluster addresses too
1914 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1915 }
1916 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1917 i != pending_inc.new_weight.end();
1918 ++i) {
1919 if (i->second == CEPH_OSD_OUT) {
1920 dout(2) << " osd." << i->first << " OUT" << dendl;
1921 } else if (i->second == CEPH_OSD_IN) {
1922 dout(2) << " osd." << i->first << " IN" << dendl;
1923 } else {
1924 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1925 }
1926 }
1927
1928 // features for osdmap and its incremental
1929 uint64_t features;
1930
1931 // encode full map and determine its crc
1932 OSDMap tmp;
1933 {
1934 tmp.deepish_copy_from(osdmap);
1935 tmp.apply_incremental(pending_inc);
1936
1937 // determine appropriate features
1938 features = tmp.get_encoding_features();
1939 dout(10) << __func__ << " encoding full map with "
1940 << tmp.require_osd_release
1941 << " features " << features << dendl;
1942
1943 // the features should be a subset of the mon quorum's features!
1944 ceph_assert((features & ~mon->get_quorum_con_features()) == 0);
1945
1946 bufferlist fullbl;
1947 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
1948 pending_inc.full_crc = tmp.get_crc();
1949
1950 // include full map in the txn. note that old monitors will
1951 // overwrite this. new ones will now skip the local full map
1952 // encode and reload from this.
1953 put_version_full(t, pending_inc.epoch, fullbl);
1954 }
1955
1956 // encode
1957 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
1958 bufferlist bl;
1959 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
1960
1961 dout(20) << " full_crc " << tmp.get_crc()
1962 << " inc_crc " << pending_inc.inc_crc << dendl;
1963
1964 /* put everything in the transaction */
1965 put_version(t, pending_inc.epoch, bl);
1966 put_last_committed(t, pending_inc.epoch);
1967
1968 // metadata, too!
1969 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
1970 p != pending_metadata.end();
1971 ++p)
1972 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
1973 for (set<int>::iterator p = pending_metadata_rm.begin();
1974 p != pending_metadata_rm.end();
1975 ++p)
1976 t->erase(OSD_METADATA_PREFIX, stringify(*p));
1977 pending_metadata.clear();
1978 pending_metadata_rm.clear();
1979
1980 // purged_snaps
1981 if (tmp.require_osd_release >= ceph_release_t::octopus &&
1982 !pending_inc.new_purged_snaps.empty()) {
1983 // all snaps purged this epoch (across all pools)
1984 string k = make_purged_snap_epoch_key(pending_inc.epoch);
1985 bufferlist v;
1986 encode(pending_inc.new_purged_snaps, v);
1987 t->put(OSD_SNAP_PREFIX, k, v);
1988 }
1989 for (auto& i : pending_inc.new_purged_snaps) {
1990 for (auto q = i.second.begin();
1991 q != i.second.end();
1992 ++q) {
1993 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
1994 pending_inc.epoch,
1995 t);
1996 }
1997 }
1998 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
1999 for (auto snap : snaps) {
2000 insert_purged_snap_update(pool, snap, snap + 1,
2001 pending_inc.epoch,
2002 t);
2003 }
2004 }
2005
2006 // health
2007 health_check_map_t next;
2008 tmp.check_health(cct, &next);
2009 encode_health(next, t);
2010 }
2011
2012 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2013 {
2014 bufferlist bl;
2015 int r = mon->store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2016 if (r < 0)
2017 return r;
2018 try {
2019 auto p = bl.cbegin();
2020 decode(m, p);
2021 }
2022 catch (buffer::error& e) {
2023 if (err)
2024 *err << "osd." << osd << " metadata is corrupt";
2025 return -EIO;
2026 }
2027 return 0;
2028 }
2029
2030 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2031 {
2032 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2033 if (osdmap.is_up(osd)) {
2034 map<string,string> meta;
2035 load_metadata(osd, meta, nullptr);
2036 auto p = meta.find(field);
2037 if (p == meta.end()) {
2038 (*out)["unknown"]++;
2039 } else {
2040 (*out)[p->second]++;
2041 }
2042 }
2043 }
2044 }
2045
2046 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2047 {
2048 map<string,int> by_val;
2049 count_metadata(field, &by_val);
2050 f->open_object_section(field.c_str());
2051 for (auto& p : by_val) {
2052 f->dump_int(p.first.c_str(), p.second);
2053 }
2054 f->close_section();
2055 }
2056
2057 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2058 {
2059 map<string, string> metadata;
2060 int r = load_metadata(osd, metadata, nullptr);
2061 if (r < 0)
2062 return r;
2063
2064 auto it = metadata.find("osd_objectstore");
2065 if (it == metadata.end())
2066 return -ENOENT;
2067 *type = it->second;
2068 return 0;
2069 }
2070
2071 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2072 const pg_pool_t &pool,
2073 ostream *err)
2074 {
2075 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2076 // since filestore osds could always join the pool later
2077 set<int> checked_osds;
2078 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2079 vector<int> up, acting;
2080 pg_t pgid(ps, pool_id);
2081 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2082 for (int osd : up) {
2083 if (checked_osds.find(osd) != checked_osds.end())
2084 continue;
2085 string objectstore_type;
2086 int r = get_osd_objectstore_type(osd, &objectstore_type);
2087 // allow with missing metadata, e.g. due to an osd never booting yet
2088 if (r < 0 || objectstore_type == "bluestore") {
2089 checked_osds.insert(osd);
2090 continue;
2091 }
2092 *err << "osd." << osd << " uses " << objectstore_type;
2093 return false;
2094 }
2095 }
2096 return true;
2097 }
2098
2099 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2100 {
2101 map<string,string> m;
2102 if (int r = load_metadata(osd, m, err))
2103 return r;
2104 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2105 f->dump_string(p->first.c_str(), p->second);
2106 return 0;
2107 }
2108
2109 void OSDMonitor::print_nodes(Formatter *f)
2110 {
2111 // group OSDs by their hosts
2112 map<string, list<int> > osds; // hostname => osd
2113 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2114 map<string, string> m;
2115 if (load_metadata(osd, m, NULL)) {
2116 continue;
2117 }
2118 map<string, string>::iterator hostname = m.find("hostname");
2119 if (hostname == m.end()) {
2120 // not likely though
2121 continue;
2122 }
2123 osds[hostname->second].push_back(osd);
2124 }
2125
2126 dump_services(f, osds, "osd");
2127 }
2128
2129 void OSDMonitor::share_map_with_random_osd()
2130 {
2131 if (osdmap.get_num_up_osds() == 0) {
2132 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2133 return;
2134 }
2135
2136 MonSession *s = mon->session_map.get_random_osd_session(&osdmap);
2137 if (!s) {
2138 dout(10) << __func__ << " no up osd on our session map" << dendl;
2139 return;
2140 }
2141
2142 dout(10) << "committed, telling random " << s->name
2143 << " all about it" << dendl;
2144
2145 // get feature of the peer
2146 // use quorum_con_features, if it's an anonymous connection.
2147 uint64_t features = s->con_features ? s->con_features :
2148 mon->get_quorum_con_features();
2149 // whatev, they'll request more if they need it
2150 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2151 s->con->send_message(m);
2152 // NOTE: do *not* record osd has up to this epoch (as we do
2153 // elsewhere) as they may still need to request older values.
2154 }
2155
// Return the highest osdmap version we may trim up to, or 0 if trimming
// is not currently possible.  Trimming is blocked while quorum is not
// formed, while pgs are still being created, or when the debug block
// option is set; otherwise the trim floor is derived from the minimum
// last-epoch-clean across osds, clamped by configuration.
version_t OSDMonitor::get_trim_to() const
{
  if (mon->get_quorum().empty()) {
    dout(10) << __func__ << ": quorum not formed" << dendl;
    return 0;
  }

  {
    // pending pg creations may still need older maps; don't trim yet.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	       " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << dendl;
    return 0;
  }

  {
    // start from the oldest epoch any osd still needs.
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an operator may force trimming up to a specific epoch (for repair).
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs recent maps.
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only meaningful if it is past what has already been trimmed.
    if (floor > get_first_committed())
      return floor;
  }
  return 0;
}
2198
2199 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2200 {
2201 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2202 // also scan osd epochs
2203 // don't trim past the oldest reported osd epoch
2204 for (auto& osd_epoch : osd_epochs) {
2205 if (osd_epoch.second < floor) {
2206 floor = osd_epoch.second;
2207 }
2208 }
2209 return floor;
2210 }
2211
2212 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2213 version_t first)
2214 {
2215 dout(10) << __func__ << " including full map for e " << first << dendl;
2216 bufferlist bl;
2217 get_version_full(first, bl);
2218 put_version_full(tx, first, bl);
2219
2220 if (has_osdmap_manifest &&
2221 first > osdmap_manifest.get_first_pinned()) {
2222 _prune_update_trimmed(tx, first);
2223 }
2224 }
2225
2226
2227 /* full osdmap prune
2228 *
2229 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2230 */
2231
2232 void OSDMonitor::load_osdmap_manifest()
2233 {
2234 bool store_has_manifest =
2235 mon->store->exists(get_service_name(), "osdmap_manifest");
2236
2237 if (!store_has_manifest) {
2238 if (!has_osdmap_manifest) {
2239 return;
2240 }
2241
2242 dout(20) << __func__
2243 << " dropping osdmap manifest from memory." << dendl;
2244 osdmap_manifest = osdmap_manifest_t();
2245 has_osdmap_manifest = false;
2246 return;
2247 }
2248
2249 dout(20) << __func__
2250 << " osdmap manifest detected in store; reload." << dendl;
2251
2252 bufferlist manifest_bl;
2253 int r = get_value("osdmap_manifest", manifest_bl);
2254 if (r < 0) {
2255 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2256 ceph_abort_msg("error reading manifest");
2257 }
2258 osdmap_manifest.decode(manifest_bl);
2259 has_osdmap_manifest = true;
2260
2261 dout(10) << __func__ << " store osdmap manifest pinned ("
2262 << osdmap_manifest.get_first_pinned()
2263 << " .. "
2264 << osdmap_manifest.get_last_pinned()
2265 << ")"
2266 << dendl;
2267 }
2268
// Decide whether a full-osdmap prune should run now, based on how many
// committed epochs we hold versus the configured minimums and interval.
// See doc/dev/mon-osdmap-prune.rst for the overall design.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  // number of recent epochs that must always be kept untouched.
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  // minimum number of prunable epochs before pruning is worthwhile.
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  // distance between consecutive pinned maps.
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous prune already pinned everything we are allowed to pin.
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full interval fits between the last pin and the limit.
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2328
2329 void OSDMonitor::_prune_update_trimmed(
2330 MonitorDBStore::TransactionRef tx,
2331 version_t first)
2332 {
2333 dout(10) << __func__
2334 << " first " << first
2335 << " last_pinned " << osdmap_manifest.get_last_pinned()
2336 << " last_pinned " << osdmap_manifest.get_last_pinned()
2337 << dendl;
2338
2339 osdmap_manifest_t manifest = osdmap_manifest;
2340
2341 if (!manifest.is_pinned(first)) {
2342 manifest.pin(first);
2343 }
2344
2345 set<version_t>::iterator p_end = manifest.pinned.find(first);
2346 set<version_t>::iterator p = manifest.pinned.begin();
2347 manifest.pinned.erase(p, p_end);
2348 ceph_assert(manifest.get_first_pinned() == first);
2349
2350 if (manifest.get_last_pinned() == first+1 ||
2351 manifest.pinned.size() == 1) {
2352 // we reached the end of the line, as pinned maps go; clean up our
2353 // manifest, and let `should_prune()` decide whether we should prune
2354 // again.
2355 tx->erase(get_service_name(), "osdmap_manifest");
2356 return;
2357 }
2358
2359 bufferlist bl;
2360 manifest.encode(bl);
2361 tx->put(get_service_name(), "osdmap_manifest", bl);
2362 }
2363
// Choose the first map to pin for this prune iteration and record it in
// `manifest`: start from the first committed version when no manifest
// exists, or resume from the last pin of the previous prune otherwise.
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon->store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2398
2399 bool OSDMonitor::_prune_sanitize_options() const
2400 {
2401 uint64_t prune_interval =
2402 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2403 uint64_t prune_min =
2404 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2405 uint64_t txsize =
2406 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2407
2408 bool r = true;
2409
2410 if (prune_interval == 0) {
2411 derr << __func__
2412 << " prune is enabled BUT prune interval is zero; abort."
2413 << dendl;
2414 r = false;
2415 } else if (prune_interval == 1) {
2416 derr << __func__
2417 << " prune interval is equal to one, which essentially means"
2418 " no pruning; abort."
2419 << dendl;
2420 r = false;
2421 }
2422 if (prune_min == 0) {
2423 derr << __func__
2424 << " prune is enabled BUT prune min is zero; abort."
2425 << dendl;
2426 r = false;
2427 }
2428 if (prune_interval > prune_min) {
2429 derr << __func__
2430 << " impossible to ascertain proper prune interval because"
2431 << " it is greater than the minimum prune epochs"
2432 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2433 << dendl;
2434 r = false;
2435 }
2436
2437 if (txsize < prune_interval - 1) {
2438 derr << __func__
2439 << "'mon_osdmap_full_prune_txsize' (" << txsize
2440 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2441 << "); abort." << dendl;
2442 r = false;
2443 }
2444 return r;
2445 }
2446
2447 bool OSDMonitor::is_prune_enabled() const {
2448 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2449 }
2450
2451 bool OSDMonitor::is_prune_supported() const {
2452 return mon->get_required_mon_features().contains_any(
2453 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2454 }
2455
/** do_prune
 *
 * Prune full osdmap versions from the store: pin one map every
 * `mon_osdmap_full_prune_interval` epochs and erase the full maps in
 * between, up to `mon_osdmap_full_prune_txsize` removals per call.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  // helper: does a full map for version v exist in the store?
  auto map_exists = [this](version_t v) {
    string k = mon->store->combine_strings("full", v);
    return mon->store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      // the next interval would cross the limit; stop here.
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // both interval endpoints must still be in the store.
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins.
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon->store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits.
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2584
2585
2586 // -------------
2587
// Dispatch an incoming message to its read-only preprocess handler.
// Returns true when the op was fully handled here; false when it needs
// a map update and must go through prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: answer EINVAL instead of crashing.
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // an unrecognized type reaching the osd monitor is a bug.
    ceph_abort();
    return true;
  }
}
2643
// Dispatch an incoming message to its map-mutating prepare handler.
// Returns true if the pending incremental was (possibly) modified and a
// proposal should follow; false otherwise.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: answer EINVAL instead of crashing.
      bufferlist bl;
      mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // preprocess_query() should have filtered every other type.
    ceph_abort();
  }

  return false;
}
2695
2696 bool OSDMonitor::should_propose(double& delay)
2697 {
2698 dout(10) << "should_propose" << dendl;
2699
2700 // if full map, propose immediately! any subsequent changes will be clobbered.
2701 if (pending_inc.fullmap.length())
2702 return true;
2703
2704 // adjust osd weights?
2705 if (!osd_weight.empty() &&
2706 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2707 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2708 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2709 delay = 0.0;
2710 osd_weight.clear();
2711 return true;
2712 }
2713
2714 return PaxosService::should_propose(delay);
2715 }
2716
2717
2718
2719 // ---------------------------
2720 // READs
2721
// Answer a MMonGetOSDMap request with the requested ranges of full and
// incremental maps, capped by osd_map_message_max (count) and
// osd_map_message_max_bytes (size) across both loops.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the requester's connection features when known;
  // fall back to the quorum features for an anonymous connection.
  uint64_t features = mon->get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon->monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  // shared budgets: total map count and total encoded bytes.
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first, clamped to what we actually have committed ...
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // ... then incrementals, drawing on the same remaining budget.
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon->send_reply(op, reply);
  return true;
}
2758
2759
2760 // ---------------------------
2761 // UPDATEs
2762
2763 // failure --
2764
2765 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2766 // check permissions
2767 MonSession *session = op->get_session();
2768 if (!session)
2769 return true;
2770 if (!session->is_capable("osd", MON_CAP_X)) {
2771 dout(0) << "got MOSDFailure from entity with insufficient caps "
2772 << session->caps << dendl;
2773 return true;
2774 }
2775 if (fsid != mon->monmap->fsid) {
2776 dout(0) << "check_source: on fsid " << fsid
2777 << " != " << mon->monmap->fsid << dendl;
2778 return true;
2779 }
2780 return false;
2781 }
2782
2783
// Read-only filtering of an MOSDFailure report.  Stale, duplicate,
// mismatched or disallowed reports are answered (or silently dropped)
// here; returns false only for a fresh, valid report that needs
// prepare_failure() to act on it.
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, readdressed, or itself down: refresh it.
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  // the reported addresses must match what the map currently has.
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // nodown flag or up-ratio floor may veto the mark-down.
  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  // fresh report: hand off to prepare_failure().
  return false;

 didit:
  mon->no_reply(op);
  return true;
}
2855
// Completion for a MOSDMarkMeDown request: on success, echo the request
// back to the osd as an acknowledgement; on -EAGAIN, re-dispatch the op
// through the osd monitor; any other result aborts.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: reply with a copy of the request as the ack.
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon->send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // retry the whole op from scratch.
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
2884
// Read-only checks for a MOSDMarkMeDown request.  Returns false when
// the request is valid and should proceed to prepare_mark_me_down();
// otherwise acks immediately (if requested) and returns true.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // target must exist, be up, and match the addresses in the request.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack right away even though no map change will occur.
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
2923
2924 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
2925 {
2926 op->mark_osdmon_event(__func__);
2927 auto m = op->get_req<MOSDMarkMeDown>();
2928 int target_osd = m->target_osd;
2929
2930 ceph_assert(osdmap.is_up(target_osd));
2931 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
2932
2933 mon->clog->info() << "osd." << target_osd << " marked itself down";
2934 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
2935 if (m->request_ack)
2936 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
2937 return true;
2938 }
2939
2940 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
2941 {
2942 op->mark_osdmon_event(__func__);
2943 auto m = op->get_req<MOSDMarkMeDead>();
2944 int from = m->target_osd;
2945
2946 // check permissions
2947 if (check_source(op, m->fsid)) {
2948 mon->no_reply(op);
2949 return true;
2950 }
2951
2952 // first, verify the reporting host is valid
2953 if (!m->get_orig_source().is_osd()) {
2954 mon->no_reply(op);
2955 return true;
2956 }
2957
2958 if (!osdmap.exists(from) ||
2959 !osdmap.is_down(from)) {
2960 dout(5) << __func__ << " from nonexistent or up osd." << from
2961 << ", ignoring" << dendl;
2962 send_incremental(op, m->get_epoch()+1);
2963 mon->no_reply(op);
2964 return true;
2965 }
2966
2967 return false;
2968 }
2969
// Commit an osd's self-declared death by recording dead_epoch in its
// extended info.  preprocess_mark_me_dead() already verified the osd is
// down.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_down(target_osd));

  mon->clog->info() << "osd." << target_osd << " marked itself dead as of e"
		    << m->get_epoch();
  // start from the committed xinfo unless this epoch already staged a copy
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // on successful commit we deliberately send no reply; the osd is dead
  // and nothing is waiting on an ack
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon->no_reply(op); // ignore on success
	}
      }
    ));
  return true;
}
2995
2996 bool OSDMonitor::can_mark_down(int i)
2997 {
2998 if (osdmap.is_nodown(i)) {
2999 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3000 << "will not mark it down" << dendl;
3001 return false;
3002 }
3003
3004 int num_osds = osdmap.get_num_osds();
3005 if (num_osds == 0) {
3006 dout(5) << __func__ << " no osds" << dendl;
3007 return false;
3008 }
3009 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3010 float up_ratio = (float)up / (float)num_osds;
3011 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3012 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3013 << g_conf()->mon_osd_min_up_ratio
3014 << ", will not mark osd." << i << " down" << dendl;
3015 return false;
3016 }
3017 return true;
3018 }
3019
3020 bool OSDMonitor::can_mark_up(int i)
3021 {
3022 if (osdmap.is_noup(i)) {
3023 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3024 << "will not mark it up" << dendl;
3025 return false;
3026 }
3027
3028 return true;
3029 }
3030
3031 /**
3032 * @note the parameter @p i apparently only exists here so we can output the
3033 * osd's id on messages.
3034 */
3035 bool OSDMonitor::can_mark_out(int i)
3036 {
3037 if (osdmap.is_noout(i)) {
3038 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3039 << "will not mark it out" << dendl;
3040 return false;
3041 }
3042
3043 int num_osds = osdmap.get_num_osds();
3044 if (num_osds == 0) {
3045 dout(5) << __func__ << " no osds" << dendl;
3046 return false;
3047 }
3048 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3049 float in_ratio = (float)in / (float)num_osds;
3050 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3051 if (i >= 0)
3052 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3053 << g_conf()->mon_osd_min_in_ratio
3054 << ", will not mark osd." << i << " out" << dendl;
3055 else
3056 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3057 << g_conf()->mon_osd_min_in_ratio
3058 << ", will not mark osds out" << dendl;
3059 return false;
3060 }
3061
3062 return true;
3063 }
3064
3065 bool OSDMonitor::can_mark_in(int i)
3066 {
3067 if (osdmap.is_noin(i)) {
3068 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3069 << "will not mark it in" << dendl;
3070 return false;
3071 }
3072
3073 return true;
3074 }
3075
3076 bool OSDMonitor::check_failures(utime_t now)
3077 {
3078 bool found_failure = false;
3079 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3080 p != failure_info.end();
3081 ++p) {
3082 if (can_mark_down(p->first)) {
3083 found_failure |= check_failure(now, p->first, p->second);
3084 }
3085 }
3086 return found_failure;
3087 }
3088
3089 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3090 {
3091 // already pending failure?
3092 if (pending_inc.new_state.count(target_osd) &&
3093 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3094 dout(10) << " already pending failure" << dendl;
3095 return true;
3096 }
3097
3098 set<string> reporters_by_subtree;
3099 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3100 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3101 utime_t max_failed_since = fi.get_failed_since();
3102 utime_t failed_for = now - max_failed_since;
3103
3104 utime_t grace = orig_grace;
3105 double my_grace = 0, peer_grace = 0;
3106 double decay_k = 0;
3107 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3108 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3109 decay_k = ::log(.5) / halflife;
3110
3111 // scale grace period based on historical probability of 'lagginess'
3112 // (false positive failures due to slowness).
3113 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3114 double decay = exp((double)failed_for * decay_k);
3115 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3116 << " failed_for " << failed_for << " decay " << decay << dendl;
3117 my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3118 grace += my_grace;
3119 }
3120
3121 // consider the peers reporting a failure a proxy for a potential
3122 // 'subcluster' over the overall cluster that is similarly
3123 // laggy. this is clearly not true in all cases, but will sometimes
3124 // help us localize the grace correction to a subset of the system
3125 // (say, a rack with a bad switch) that is unhappy.
3126 ceph_assert(fi.reporters.size());
3127 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3128 // get the parent bucket whose type matches with "reporter_subtree_level".
3129 // fall back to OSD if the level doesn't exist.
3130 if (osdmap.exists(p->first)) {
3131 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3132 if (auto iter = reporter_loc.find(reporter_subtree_level);
3133 iter == reporter_loc.end()) {
3134 reporters_by_subtree.insert("osd." + to_string(p->first));
3135 } else {
3136 reporters_by_subtree.insert(iter->second);
3137 }
3138 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3139 const osd_xinfo_t& xi = osdmap.get_xinfo(p->first);
3140 utime_t elapsed = now - xi.down_stamp;
3141 double decay = exp((double)elapsed * decay_k);
3142 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3143 }
3144 ++p;
3145 } else {
3146 fi.cancel_report(p->first);;
3147 p = fi.reporters.erase(p);
3148 }
3149 }
3150
3151 if (g_conf()->mon_osd_adjust_heartbeat_grace) {
3152 peer_grace /= (double)fi.reporters.size();
3153 grace += peer_grace;
3154 }
3155
3156 dout(10) << " osd." << target_osd << " has "
3157 << fi.reporters.size() << " reporters, "
3158 << grace << " grace (" << orig_grace << " + " << my_grace
3159 << " + " << peer_grace << "), max_failed_since " << max_failed_since
3160 << dendl;
3161
3162 if (failed_for >= grace &&
3163 reporters_by_subtree.size() >= g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3164 dout(1) << " we have enough reporters to mark osd." << target_osd
3165 << " down" << dendl;
3166 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3167
3168 mon->clog->info() << "osd." << target_osd << " failed ("
3169 << osdmap.crush->get_full_location_ordered_string(
3170 target_osd)
3171 << ") ("
3172 << (int)reporters_by_subtree.size()
3173 << " reporters from different "
3174 << reporter_subtree_level << " after "
3175 << failed_for << " >= grace " << grace << ")";
3176 return true;
3177 }
3178 return false;
3179 }
3180
3181 void OSDMonitor::force_failure(int target_osd, int by)
3182 {
3183 // already pending failure?
3184 if (pending_inc.new_state.count(target_osd) &&
3185 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3186 dout(10) << " already pending failure" << dendl;
3187 return;
3188 }
3189
3190 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3191 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3192 if (!pending_inc.new_xinfo.count(target_osd)) {
3193 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3194 }
3195 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3196
3197 mon->clog->info() << "osd." << target_osd << " failed ("
3198 << osdmap.crush->get_full_location_ordered_string(target_osd)
3199 << ") (connection refused reported by osd." << by << ")";
3200 return;
3201 }
3202
// Handle an MOSDFailure in the prepare phase: either record/evaluate a new
// failure report for the target osd, or cancel a previously filed report.
// Returns true when a map change should be proposed.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess stage guarantees the target is up and the addrs match
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon->no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    // the reporter has observed the failure for failed_for seconds already
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: fail right away, no grace accounting
      mon->clog->debug() << "osd." << m->get_target_osd()
			 << " reported immediately failed by "
			 << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon->clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		       << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    // a duplicate report from the same reporter replaces the old op;
    // release the superseded one
    MonOpRequestRef old_op = fi.add_report(reporter, failed_since, op);
    if (old_op) {
      mon->no_reply(old_op);
    }

    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon->clog->debug() << "osd." << m->get_target_osd()
		       << " failure report canceled by "
		       << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      // release the op held for the canceled report
      MonOpRequestRef report_op = fi.cancel_report(reporter);
      if (report_op) {
        mon->no_reply(report_op);
      }
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3269
3270 void OSDMonitor::process_failures()
3271 {
3272 map<int,failure_info_t>::iterator p = failure_info.begin();
3273 while (p != failure_info.end()) {
3274 if (osdmap.is_up(p->first)) {
3275 ++p;
3276 } else {
3277 dout(10) << "process_failures osd." << p->first << dendl;
3278 list<MonOpRequestRef> ls;
3279 p->second.take_report_messages(ls);
3280 failure_info.erase(p++);
3281
3282 while (!ls.empty()) {
3283 MonOpRequestRef o = ls.front();
3284 if (o) {
3285 o->mark_event(__func__);
3286 MOSDFailure *m = o->get_req<MOSDFailure>();
3287 send_latest(o, m->get_epoch());
3288 mon->no_reply(o);
3289 }
3290 ls.pop_front();
3291 }
3292 }
3293 }
3294 }
3295
3296 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3297 {
3298 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3299
3300 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3301 p != failure_info.end();
3302 ++p) {
3303 p->second.take_report_messages(ls);
3304 }
3305 failure_info.clear();
3306 }
3307
3308
3309 // boot --
3310
// Fast-path validation of an osd boot message.  Returns true when the boot
// was fully handled (ignored, duplicate, or deferred with a map); false to
// fall through to prepare_boot(), which stages the actual map change.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the osd must belong to this cluster
  if (m->sb.cluster_fsid != mon->monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // force all osds to have gone through luminous prior to upgrade to nautilus
  {
    vector<string> missing;
    if (!HAVE_FEATURE(m->osd_features, SERVER_LUMINOUS)) {
      missing.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_JEWEL)) {
      missing.push_back("CEPH_FEATURE_SERVER_JEWEL");
    }
    if (!HAVE_FEATURE(m->osd_features, SERVER_KRAKEN)) {
      missing.push_back("CEPH_FEATURE_SERVER_KRAKEN");
    }
    if (!HAVE_FEATURE(m->osd_features, OSD_RECOVERY_DELETES)) {
      missing.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
    }

    if (!missing.empty()) {
      using std::experimental::make_ostream_joiner;

      stringstream ss;
      copy(begin(missing), end(missing), make_ostream_joiner(ss, ";"));

      mon->clog->info() << "disallowing boot of OSD "
			<< m->get_orig_source_inst()
			<< " because the osd lacks " << ss.str();
      goto ignore;
    }
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS) &&
      osdmap.require_osd_release < ceph_release_t::mimic) {
    mon->clog->info() << "disallowing boot of octopus+ OSD "
		      << m->get_orig_source_inst()
		      << " because require_osd_release < mimic";
    goto ignore;
  }

  // The release check here is required because for OSD_PGLOG_HARDLIMIT,
  // we are reusing a jewel feature bit that was retired in luminous.
  if (osdmap.require_osd_release >= ceph_release_t::luminous &&
      osdmap.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT) &&
      !(m->osd_features & CEPH_FEATURE_OSD_PGLOG_HARDLIMIT)) {
    mon->clog->info() << "disallowing boot of OSD "
		      << m->get_orig_source_inst()
		      << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    // duplicate boot: just send the latest map, don't log "boot" again
    _booted(op, false);
    return true;
  }

  // a non-zero recorded uuid that differs means a different physical osd
  // is trying to claim this id
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from an epoch before the osd's last up_from is stale
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3433
// Stage the map changes for a booting osd: mark the previous incarnation
// down if needed, record the new addrs/uuid/metadata, update the laggy
// statistics in xinfo, and optionally mark the osd back in.  Always returns
// true (a proposal is needed); the osd is acked via C_Booted/C_RetryMessage
// once the proposal commits.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // fold in any state change already staged this epoch (new_state is an
  // XOR mask over the committed state)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the down has been committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the laggy statistics: a boot_epoch of 0 means a clean
    // (re)start, which decays the laggy estimate; otherwise the osd came
    // back from an unclean interval and the estimate is reinforced.
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	// exponential moving average of how long the osd stays down,
	// clamped at mon_osd_laggy_max_interval
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval =  g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	// restore the weight the osd had before it was auto-marked out
	if (xi.old_weight > 0) {
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3588
3589 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3590 {
3591 op->mark_osdmon_event(__func__);
3592 auto m = op->get_req<MOSDBoot>();
3593 dout(7) << "_booted " << m->get_orig_source_inst()
3594 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3595
3596 if (logit) {
3597 mon->clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3598 << " boot";
3599 }
3600
3601 send_latest(op, m->sb.current_epoch+1);
3602 }
3603
3604
3605 // -------------
3606 // full
3607
// Fast-path validation of an osd's fullness-state report (nearfull /
// backfillfull / full).  Returns true if handled here; false to fall
// through to prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these fullness bits may be changed via MOSDFull
  // (declared before the gotos below, as C++ requires)
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // the message must match the current (if up) or most recent (if down)
  // incarnation of the osd; anything else is from a stale instance
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // requested state already committed?  just reply with the map.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3658
// Stage the osd's requested fullness bits in the pending map.  new_state
// entries are XOR masks, so the computation below derives the flip needed
// to go from the committed state to the wanted state.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state XOR whatever is already
  // staged this epoch
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // clear any staged fullness flips; we recompute them below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed and wanted bits yields the mask that flips the
    // committed state into the wanted one
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3696
3697 // -------------
3698 // alive
3699
3700 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3701 {
3702 op->mark_osdmon_event(__func__);
3703 auto m = op->get_req<MOSDAlive>();
3704 int from = m->get_orig_source().num();
3705
3706 // check permissions, ignore if failed
3707 MonSession *session = op->get_session();
3708 if (!session)
3709 goto ignore;
3710 if (!session->is_capable("osd", MON_CAP_X)) {
3711 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3712 << session->caps << dendl;
3713 goto ignore;
3714 }
3715
3716 if (!osdmap.is_up(from) ||
3717 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3718 dout(7) << "preprocess_alive ignoring alive message from down "
3719 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3720 << dendl;
3721 goto ignore;
3722 }
3723
3724 if (osdmap.get_up_thru(from) >= m->want) {
3725 // yup.
3726 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3727 _reply_map(op, m->version);
3728 return true;
3729 }
3730
3731 dout(10) << "preprocess_alive want up_thru " << m->want
3732 << " from " << m->get_orig_source_inst() << dendl;
3733 return false;
3734
3735 ignore:
3736 return true;
3737 }
3738
3739 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3740 {
3741 op->mark_osdmon_event(__func__);
3742 auto m = op->get_req<MOSDAlive>();
3743 int from = m->get_orig_source().num();
3744
3745 if (0) { // we probably don't care much about these
3746 mon->clog->debug() << m->get_orig_source_inst() << " alive";
3747 }
3748
3749 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3750 << " from " << m->get_orig_source_inst() << dendl;
3751
3752 update_up_thru(from, m->version); // set to the latest map the OSD has
3753 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3754 return true;
3755 }
3756
3757 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3758 {
3759 op->mark_osdmon_event(__func__);
3760 dout(7) << "_reply_map " << e
3761 << " from " << op->get_req()->get_orig_source_inst()
3762 << dendl;
3763 send_latest(op, e);
3764 }
3765
3766 // pg_created
3767 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3768 {
3769 op->mark_osdmon_event(__func__);
3770 auto m = op->get_req<MOSDPGCreated>();
3771 dout(10) << __func__ << " " << *m << dendl;
3772 auto session = op->get_session();
3773 mon->no_reply(op);
3774 if (!session) {
3775 dout(10) << __func__ << ": no monitor session!" << dendl;
3776 return true;
3777 }
3778 if (!session->is_capable("osd", MON_CAP_X)) {
3779 derr << __func__ << " received from entity "
3780 << "with insufficient privileges " << session->caps << dendl;
3781 return true;
3782 }
3783 // always forward the "created!" to the leader
3784 return false;
3785 }
3786
3787 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3788 {
3789 op->mark_osdmon_event(__func__);
3790 auto m = op->get_req<MOSDPGCreated>();
3791 dout(10) << __func__ << " " << *m << dendl;
3792 auto src = m->get_orig_source();
3793 auto from = src.num();
3794 if (!src.is_osd() ||
3795 !mon->osdmon()->osdmap.is_up(from) ||
3796 !mon->osdmon()->osdmap.get_addrs(from).legacy_equals(
3797 m->get_orig_source_addrs())) {
3798 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3799 return false;
3800 }
3801 pending_created_pgs.push_back(m->pgid);
3802 return true;
3803 }
3804
// Fast-path validation of an osd's "ready to merge" notification for the
// highest pg of a pool whose pg_num is shrinking.  Returns true if the
// message is invalid or stale and was dropped here; false to fall through
// to prepare_pg_ready_to_merge().
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // declared before the gotos below, as C++ requires
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge already happened (pg_num dropped past this pg)
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the current highest pg (pg_num - 1) can be the merge source
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge for this pg is actually pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon->no_reply(op);
  return true;
}
3844
// Commit (or back off) a pg merge for which the source pg reported ready.
// On success the pool's pg_num is decremented; if the osd reported
// not-ready, the pending merge is canceled by resetting pg_num_pending.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // work on the freshest view of the pool: staged copy if one exists,
  // otherwise the committed pool
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // re-check the preconditions against the pending state; they may have
  // changed since preprocess ran against the committed map
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // test hook: randomly bounce pg_num back up to exercise merge cancel
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon->monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon->op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
3902
3903
3904 // -------------
3905 // pg_temp changes
3906
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  // Decide whether an MOSDPGTemp message can be handled without a paxos
  // proposal.  Returns true when the message is fully dealt with here
  // (dropped or replied to); returns false to route it to prepare_pgtemp().
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // only accept messages from osds that the current map considers up and
  // whose address matches (i.e. the same osd instance, not a reincarnation)
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // a forced pg_temp always goes through the prepare path
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
             << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
             << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
               << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
                              osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
        (osdmap.pg_temp->count(p->first) == 0 ||
         osdmap.pg_temp->get(p->first) != p->second ||
         osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // no actual changes requested; reply with the current map epoch
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  return true;
}
3999
4000 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4001 {
4002 epoch_t old_up_thru = osdmap.get_up_thru(from);
4003 auto ut = pending_inc.new_up_thru.find(from);
4004 if (ut != pending_inc.new_up_thru.end()) {
4005 old_up_thru = ut->second;
4006 }
4007 if (up_thru > old_up_thru) {
4008 // set up_thru too, so the osd doesn't have to ask again
4009 pending_inc.new_up_thru[from] = up_thru;
4010 }
4011 }
4012
4013 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4014 {
4015 op->mark_osdmon_event(__func__);
4016 auto m = op->get_req<MOSDPGTemp>();
4017 int from = m->get_orig_source().num();
4018 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4019 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4020 uint64_t pool = p->first.pool();
4021 if (pending_inc.old_pools.count(pool)) {
4022 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4023 << ": pool pending removal" << dendl;
4024 continue;
4025 }
4026 if (!osdmap.have_pg_pool(pool)) {
4027 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4028 << ": pool has been removed" << dendl;
4029 continue;
4030 }
4031 pending_inc.new_pg_temp[p->first] =
4032 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4033
4034 // unconditionally clear pg_primary (until this message can encode
4035 // a change for that, too.. at which point we need to also fix
4036 // preprocess_pg_temp)
4037 if (osdmap.primary_temp->count(p->first) ||
4038 pending_inc.new_primary_temp.count(p->first))
4039 pending_inc.new_primary_temp[p->first] = -1;
4040 }
4041
4042 // set up_thru too, so the osd doesn't have to ask again
4043 update_up_thru(from, m->map_epoch);
4044
4045 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4046 return true;
4047 }
4048
4049
4050 // ---
4051
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  // Filter MRemoveSnaps: when every requested snap is already recorded as
  // removed (or its pool is gone) we can acknowledge here without a map
  // change; otherwise return false so prepare_remove_snaps() stages one.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon->no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
        cct,
        session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
        session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
               << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
         p != q->second.end();
         ++p) {
      // a snap beyond the pool's snap_seq, or one not yet marked removed,
      // means there is real work to do -> take the prepare path
      if (*p > pi->get_snap_seq() ||
          !_is_removed_snap(q->first, *p)) {
        return false;
      }
    }
  }

  // octopus+ peers expect an explicit ack that echoes the snaps back
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon->send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4101
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  // Stage the requested snap removals in the pending incremental.  Each
  // snap is queued only if it is not already recorded as removed in the
  // committed map, the staged pending pool, or new_removed_snaps.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
               << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed anywhere (committed or pending)
      if (!_is_removed_snap(pool, s) &&
          (!pending_inc.new_pools.count(pool) ||
           !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
          (!pending_inc.new_removed_snaps.count(pool) ||
           !pending_inc.new_removed_snaps[pool].contains(s))) {
        pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
        if (osdmap.require_osd_release < ceph_release_t::octopus) {
          // pre-octopus clusters also track removed snaps in the pool itself
          newpi->removed_snaps.insert(s);
          dout(10) << " pool " << pool << " removed_snaps added " << s
                   << " (now " << newpi->removed_snaps << ")" << dendl;
        }
        newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
        // advance snap_seq if this snap is newer than anything seen so far
        if (s > newpi->get_snap_seq()) {
          dout(10) << " pool " << pool << " snap_seq "
                   << newpi->get_snap_seq() << " -> " << s << dendl;
          newpi->set_snap_seq(s);
        }
        newpi->set_snap_epoch(pending_inc.epoch);
        dout(10) << " added pool " << pool << " snap " << s
                 << " to removed_snaps queue" << dendl;
        pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ peers expect an ack once the proposal commits
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4150
4151 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4152 {
4153 op->mark_osdmon_event(__func__);
4154 auto m = op->get_req<MMonGetPurgedSnaps>();
4155 dout(7) << __func__ << " " << *m << dendl;
4156
4157 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4158
4159 string k = make_purged_snap_epoch_key(m->start);
4160 auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
4161 it->upper_bound(k);
4162 unsigned long epoch = m->last;
4163 while (it->valid()) {
4164 if (it->key().find("purged_epoch_") != 0) {
4165 break;
4166 }
4167 string k = it->key();
4168 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4169 if (n != 1) {
4170 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4171 } else if (epoch > m->last) {
4172 break;
4173 } else {
4174 bufferlist bl = it->value();
4175 auto p = bl.cbegin();
4176 auto &v = r[epoch];
4177 try {
4178 ceph::decode(v, p);
4179 } catch (buffer::error& e) {
4180 derr << __func__ << " unable to parse value for key '" << it->key()
4181 << "': \n";
4182 bl.hexdump(*_dout);
4183 *_dout << dendl;
4184 }
4185 n += 4 + v.size() * 16;
4186 }
4187 if (n > 1048576) {
4188 // impose a semi-arbitrary limit to message size
4189 break;
4190 }
4191 it->next();
4192 }
4193
4194 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4195 reply->purged_snaps.swap(r);
4196 mon->send_reply(op, reply.detach());
4197
4198 return true;
4199 }
4200
4201 // osd beacon
4202 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4203 {
4204 op->mark_osdmon_event(__func__);
4205 // check caps
4206 auto session = op->get_session();
4207 mon->no_reply(op);
4208 if (!session) {
4209 dout(10) << __func__ << " no monitor session!" << dendl;
4210 return true;
4211 }
4212 if (!session->is_capable("osd", MON_CAP_X)) {
4213 derr << __func__ << " received from entity "
4214 << "with insufficient privileges " << session->caps << dendl;
4215 return true;
4216 }
4217 // Always forward the beacon to the leader, even if they are the same as
4218 // the old one. The leader will mark as down osds that haven't sent
4219 // beacon for a few minutes.
4220 return false;
4221 }
4222
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  // Process an OSD beacon on the leader.  Returns true only when the
  // beacon advances last_purged_snaps_scrub in the pending map (which
  // needs a proposal); everything else is in-memory bookkeeping.
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
           << " from " << src << dendl;
  int from = src.num();

  // require an up osd whose address matches the current map entry
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // in-memory liveness/epoch bookkeeping (not persisted in the map)
  last_osd_report[from] = ceph_clock_now();
  osd_epochs[from] = beacon->version;

  for (const auto& pg : beacon->pgs) {
    last_epoch_clean.report(pg, beacon->min_last_epoch_clean);
  }

  // persist a newer last_purged_snaps_scrub stamp, if the beacon has one
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4263
4264 // ---------------
4265 // map helpers
4266
4267 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4268 {
4269 op->mark_osdmon_event(__func__);
4270 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4271 << " start " << start << dendl;
4272 if (start == 0)
4273 send_full(op);
4274 else
4275 send_incremental(op, start);
4276 }
4277
4278
4279 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4280 {
4281 MOSDMap *r = new MOSDMap(mon->monmap->fsid, features);
4282 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4283 r->oldest_map = get_first_committed();
4284 r->newest_map = osdmap.get_epoch();
4285 return r;
4286 }
4287
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  // Build an MOSDMap carrying incremental maps for epochs [from, to],
  // encoded for `features`.  When an incremental is unavailable for some
  // epoch, the full map for that epoch is included instead.
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
           << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon->monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; `e > 0` also guards against epoch_t (unsigned) wrap
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
               << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch; fall back to the full map
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
        dout(20) << "build_incremental full " << e << " "
                 << bl.length() << " bytes" << dendl;
        m->maps[e] = bl;
      } else {
        ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4321
void OSDMonitor::send_full(MonOpRequestRef op)
{
  // Reply with the latest full osdmap, encoded for the requester's
  // connection features.
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon->send_reply(op, build_latest_full(op->get_session()->con_features));
}
4328
4329 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4330 {
4331 op->mark_osdmon_event(__func__);
4332
4333 MonSession *s = op->get_session();
4334 ceph_assert(s);
4335
4336 if (s->proxy_con) {
4337 // oh, we can tell the other mon to do it
4338 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4339 << first << dendl;
4340 MRoute *r = new MRoute(s->proxy_tid, NULL);
4341 r->send_osdmap_first = first;
4342 s->proxy_con->send_message(r);
4343 op->mark_event("reply: send routed send_osdmap_first reply");
4344 } else {
4345 // do it ourselves
4346 send_incremental(first, s, false, op);
4347 }
4348 }
4349
void OSDMonitor::send_incremental(epoch_t first,
                                  MonSession *session,
                                  bool onetime,
                                  MonOpRequestRef req)
{
  // Send osdmaps [first .. current] to `session`, in chunks of at most
  // osd_map_message_max epochs.  When `req` is set, exactly one chunk is
  // sent as a routed reply; when `onetime` is set we likewise stop after
  // one chunk.
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
          << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon->get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
             << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // the start of the range has been trimmed; send our oldest full map
    // as the base for subsequent incrementals
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
             << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the peer will ask for more when ready
      mon->send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
                                     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon->send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    if (onetime || req)
      break;
  }
}
4412
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  // Convenience overload: encode for the current quorum's features.
  return get_version(ver, mon->get_quorum_con_features(), bl);
}
4417
4418 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4419 {
4420 OSDMap::Incremental inc;
4421 auto q = bl.cbegin();
4422 inc.decode(q);
4423 // always encode with subset of osdmap's canonical features
4424 uint64_t f = features & inc.encode_features;
4425 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4426 << dendl;
4427 bl.clear();
4428 if (inc.fullmap.length()) {
4429 // embedded full map?
4430 OSDMap m;
4431 m.decode(inc.fullmap);
4432 inc.fullmap.clear();
4433 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4434 }
4435 if (inc.crush.length()) {
4436 // embedded crush map
4437 CrushWrapper c;
4438 auto p = inc.crush.cbegin();
4439 c.decode(p);
4440 inc.crush.clear();
4441 c.encode(inc.crush, f);
4442 }
4443 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4444 }
4445
4446 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4447 {
4448 OSDMap m;
4449 auto q = bl.cbegin();
4450 m.decode(q);
4451 // always encode with subset of osdmap's canonical features
4452 uint64_t f = features & m.get_encoding_features();
4453 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4454 << dendl;
4455 bl.clear();
4456 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4457 }
4458
4459 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4460 {
4461 uint64_t significant_features = OSDMap::get_significant_features(features);
4462 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4463 return 0;
4464 }
4465 int ret = PaxosService::get_version(ver, bl);
4466 if (ret < 0) {
4467 return ret;
4468 }
4469 // NOTE: this check is imprecise; the OSDMap encoding features may
4470 // be a subset of the latest mon quorum features, but worst case we
4471 // reencode once and then cache the (identical) result under both
4472 // feature masks.
4473 if (significant_features !=
4474 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4475 reencode_incremental_map(bl, features);
4476 }
4477 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4478 return 0;
4479 }
4480
4481 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4482 {
4483 bufferlist inc_bl;
4484 int err = get_version(ver, inc_bl);
4485 ceph_assert(err == 0);
4486 ceph_assert(inc_bl.length());
4487
4488 auto p = inc_bl.cbegin();
4489 inc.decode(p);
4490 dout(10) << __func__ << " "
4491 << " epoch " << inc.epoch
4492 << " inc_crc " << inc.inc_crc
4493 << " full_crc " << inc.full_crc
4494 << " encode_features " << inc.encode_features << dendl;
4495 return 0;
4496 }
4497
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  // Rebuild the full osdmap for `ver` when its full encoding has been
  // trimmed: start from the closest pinned full map at or below `ver`
  // (tracked by the osdmap manifest) and replay incrementals on top.
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  // prefer a cached full map newer than the pinned one, so fewer
  // incrementals need to be replayed
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon->get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
           << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
           << " e" << osdm.epoch
           << " crc " << osdm.get_crc()
           << " -- applying incremental maps." << dendl;

  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    // remember the last incremental's features for the final encode below
    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon->quorum_con_features ? mon->quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
             << " last incremental map didn't have features;"
             << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon->quorum_con_features ? mon->quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4597
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  // Convenience overload: encode for the current quorum's features.
  return get_version_full(ver, mon->get_quorum_con_features(), bl);
}
4602
4603 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4604 bufferlist& bl)
4605 {
4606 uint64_t significant_features = OSDMap::get_significant_features(features);
4607 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4608 return 0;
4609 }
4610 int ret = PaxosService::get_version_full(ver, bl);
4611 if (ret == -ENOENT) {
4612 // build map?
4613 ret = get_full_from_pinned_map(ver, bl);
4614 }
4615 if (ret < 0) {
4616 return ret;
4617 }
4618 // NOTE: this check is imprecise; the OSDMap encoding features may
4619 // be a subset of the latest mon quorum features, but worst case we
4620 // reencode once and then cache the (identical) result under both
4621 // feature masks.
4622 if (significant_features !=
4623 OSDMap::get_significant_features(mon->get_quorum_con_features())) {
4624 reencode_full_map(bl, features);
4625 }
4626 full_osd_cache.add_bytes({ver, significant_features}, bl);
4627 return 0;
4628 }
4629
4630 epoch_t OSDMonitor::blacklist(const entity_addrvec_t& av, utime_t until)
4631 {
4632 dout(10) << "blacklist " << av << " until " << until << dendl;
4633 for (auto a : av.v) {
4634 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4635 a.set_type(entity_addr_t::TYPE_ANY);
4636 } else {
4637 a.set_type(entity_addr_t::TYPE_LEGACY);
4638 }
4639 pending_inc.new_blacklist[a] = until;
4640 }
4641 return pending_inc.epoch;
4642 }
4643
4644 epoch_t OSDMonitor::blacklist(entity_addr_t a, utime_t until)
4645 {
4646 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4647 a.set_type(entity_addr_t::TYPE_ANY);
4648 } else {
4649 a.set_type(entity_addr_t::TYPE_LEGACY);
4650 }
4651 dout(10) << "blacklist " << a << " until " << until << dendl;
4652 pending_inc.new_blacklist[a] = until;
4653 return pending_inc.epoch;
4654 }
4655
4656
4657 void OSDMonitor::check_osdmap_subs()
4658 {
4659 dout(10) << __func__ << dendl;
4660 if (!osdmap.get_epoch()) {
4661 return;
4662 }
4663 auto osdmap_subs = mon->session_map.subs.find("osdmap");
4664 if (osdmap_subs == mon->session_map.subs.end()) {
4665 return;
4666 }
4667 auto p = osdmap_subs->second->begin();
4668 while (!p.end()) {
4669 auto sub = *p;
4670 ++p;
4671 check_osdmap_sub(sub);
4672 }
4673 }
4674
4675 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4676 {
4677 dout(10) << __func__ << " " << sub << " next " << sub->next
4678 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4679 if (sub->next <= osdmap.get_epoch()) {
4680 if (sub->next >= 1)
4681 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4682 else
4683 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4684 if (sub->onetime)
4685 mon->session_map.remove_sub(sub);
4686 else
4687 sub->next = osdmap.get_epoch() + 1;
4688 }
4689 }
4690
4691 void OSDMonitor::check_pg_creates_subs()
4692 {
4693 if (!osdmap.get_num_up_osds()) {
4694 return;
4695 }
4696 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4697 mon->with_session_map([this](const MonSessionMap& session_map) {
4698 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4699 if (pg_creates_subs == session_map.subs.end()) {
4700 return;
4701 }
4702 for (auto sub : *pg_creates_subs->second) {
4703 check_pg_creates_sub(sub);
4704 }
4705 });
4706 }
4707
4708 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4709 {
4710 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4711 ceph_assert(sub->type == "osd_pg_creates");
4712 // only send these if the OSD is up. we will check_subs() when they do
4713 // come up so they will get the creates then.
4714 if (sub->session->name.is_osd() &&
4715 mon->osdmon()->osdmap.is_up(sub->session->name.num())) {
4716 sub->next = send_pg_creates(sub->session->name.num(),
4717 sub->session->con.get(),
4718 sub->next);
4719 }
4720 }
4721
4722 void OSDMonitor::do_application_enable(int64_t pool_id,
4723 const std::string &app_name,
4724 const std::string &app_key,
4725 const std::string &app_value)
4726 {
4727 ceph_assert(paxos->is_plugged() && is_writeable());
4728
4729 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4730 << dendl;
4731
4732 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
4733
4734 auto pp = osdmap.get_pg_pool(pool_id);
4735 ceph_assert(pp != nullptr);
4736
4737 pg_pool_t p = *pp;
4738 if (pending_inc.new_pools.count(pool_id)) {
4739 p = pending_inc.new_pools[pool_id];
4740 }
4741
4742 if (app_key.empty()) {
4743 p.application_metadata.insert({app_name, {}});
4744 } else {
4745 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4746 }
4747 p.last_change = pending_inc.epoch;
4748 pending_inc.new_pools[pool_id] = p;
4749 }
4750
4751 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4752 pool_opts_t::key_t opt,
4753 pool_opts_t::value_t val)
4754 {
4755 auto p = pending_inc.new_pools.try_emplace(
4756 pool_id, *osdmap.get_pg_pool(pool_id));
4757 p.first->second.opts.set(opt, val);
4758 }
4759
4760 unsigned OSDMonitor::scan_for_creating_pgs(
4761 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4762 const mempool::osdmap::set<int64_t>& removed_pools,
4763 utime_t modified,
4764 creating_pgs_t* creating_pgs) const
4765 {
4766 unsigned queued = 0;
4767 for (auto& p : pools) {
4768 int64_t poolid = p.first;
4769 if (creating_pgs->created_pools.count(poolid)) {
4770 dout(10) << __func__ << " already created " << poolid << dendl;
4771 continue;
4772 }
4773 const pg_pool_t& pool = p.second;
4774 int ruleno = osdmap.crush->find_rule(pool.get_crush_rule(),
4775 pool.get_type(), pool.get_size());
4776 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4777 continue;
4778
4779 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4780 const auto created = pool.get_last_change();
4781 if (last_scan_epoch && created <= last_scan_epoch) {
4782 dout(10) << __func__ << " no change in pool " << poolid
4783 << " " << pool << dendl;
4784 continue;
4785 }
4786 if (removed_pools.count(poolid)) {
4787 dout(10) << __func__ << " pool is being removed: " << poolid
4788 << " " << pool << dendl;
4789 continue;
4790 }
4791 dout(10) << __func__ << " queueing pool create for " << poolid
4792 << " " << pool << dendl;
4793 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4794 created, modified);
4795 queued++;
4796 }
4797 return queued;
4798 }
4799
void OSDMonitor::update_creating_pgs()
{
  // Recompute creating_pgs_by_osd_epoch: for each pg still being created,
  // determine its current acting primary and the epoch at which its create
  // message should be (re)issued.
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
               << dendl;
      continue;
    }
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(spgid)) {
          if (last_acting_primary == acting_primary) {
            // same primary as before; keep the previously recorded epoch
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
4847
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  // Send pending pg-create messages for `osd`, starting from epoch `next`.
  // Returns the epoch the subscription is now current through (last + 1),
  // or `next` unchanged when nothing was sent.
  dout(30) << __func__ << " osd." << osd << " next=" << next
           << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
             << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // pre-nautilus targets get the legacy MOSDPGCreate message
  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
        if (!oldm) {
          oldm = new MOSDPGCreate(creating_pgs_epoch);
        }
        oldm->mkpg.emplace(pg.pgid,
                           pg_create_t{create->second.create_epoch, pg.pgid, 0});
        oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
        if (!m) {
          m = new MOSDPGCreate2(creating_pgs_epoch);
        }
        m->pgs.emplace(pg, make_pair(create->second.create_epoch,
                                     create->second.create_stamp));
        // include history/past_intervals when known, so the osd can seed them
        if (create->second.history.epoch_created) {
          dout(20) << __func__ << " " << pg << " " << create->second.history
                   << " " << create->second.past_intervals << dendl;
          m->pg_extra.emplace(pg, make_pair(create->second.history,
                                            create->second.past_intervals));
        }
      }
      dout(20) << __func__ << " will create " << pg
               << " at " << create->second.create_epoch << dendl;
    }
  }
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
4919
4920 // TICK
4921
4922
// Periodic maintenance.  Every monitor refreshes the osdmap manifest; the
// leader additionally: marks beacon-silent osds down, processes failure
// reports, auto-marks long-down (or destroyed) osds out, expires blacklist
// entries, prunes purged snaps, refreshes pool status, proposes a new map
// when any of that changed state, and runs the cache balancer.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // everything below mutates pending state, which only the leader owns
  if (!mon->is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if they have stopped sending beacons?
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      // how long has this osd been down?
      utime_t down = now;
      down -= i->second;
      // advance before we potentially erase/overwrite entry `o` below,
      // so the iterator stays valid
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// normal osds use the (possibly laggy-adjusted) grace; destroyed
	// osds use a separate interval and ignore the grace scaling
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon->clog->info() << "Marking osd." << o << " out (has been down for "
                            << int(down.sec()) << " seconds)";
	} else
	  continue;  // keep the entry; grace has not elapsed yet
      }

      // osd was marked out (or is no longer down+in): stop tracking it
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blacklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
       p != osdmap.blacklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blacklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blacklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();

  {
    // cache balancing is only active with tcmalloc + autotune enabled
    std::lock_guard l(balancer_lock);
    if (ceph_using_tcmalloc() && mon_memory_autotune && pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }
}
5086
5087 void OSDMonitor::_set_new_cache_sizes()
5088 {
5089 uint64_t cache_size = 0;
5090 int64_t inc_alloc = 0;
5091 int64_t full_alloc = 0;
5092 int64_t kv_alloc = 0;
5093
5094 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5095 cache_size = pcm->get_tuned_mem();
5096 inc_alloc = inc_cache->get_committed_size();
5097 full_alloc = full_cache->get_committed_size();
5098 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5099 }
5100
5101 inc_osd_cache.set_bytes(inc_alloc);
5102 full_osd_cache.set_bytes(full_alloc);
5103
5104 dout(1) << __func__ << " cache_size:" << cache_size
5105 << " inc_alloc: " << inc_alloc
5106 << " full_alloc: " << full_alloc
5107 << " kv_alloc: " << kv_alloc
5108 << dendl;
5109 }
5110
5111 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5112 std::map<int,utime_t> &last_osd_report)
5113 {
5114 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5115 if (now - mon->get_leader_since() < timeo) {
5116 // We haven't been the leader for long enough to consider OSD timeouts
5117 return false;
5118 }
5119
5120 int max_osd = osdmap.get_max_osd();
5121 bool new_down = false;
5122
5123 for (int i=0; i < max_osd; ++i) {
5124 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5125 if (!osdmap.exists(i)) {
5126 last_osd_report.erase(i); // if any
5127 continue;
5128 }
5129 if (!osdmap.is_up(i))
5130 continue;
5131 const std::map<int,utime_t>::const_iterator t = last_osd_report.find(i);
5132 if (t == last_osd_report.end()) {
5133 // it wasn't in the map; start the timer.
5134 last_osd_report[i] = now;
5135 } else if (can_mark_down(i)) {
5136 utime_t diff = now - t->second;
5137 if (diff > timeo) {
5138 mon->clog->info() << "osd." << i << " marked down after no beacon for "
5139 << diff << " seconds";
5140 derr << "no beacon from osd." << i << " since " << t->second
5141 << ", " << diff << " seconds ago. marking down" << dendl;
5142 pending_inc.new_state[i] = CEPH_OSD_UP;
5143 new_down = true;
5144 }
5145 }
5146 }
5147 return new_down;
5148 }
5149
5150 static void dump_cpu_list(Formatter *f, const char *name,
5151 const string& strlist)
5152 {
5153 cpu_set_t cpu_set;
5154 size_t cpu_set_size;
5155 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5156 return;
5157 }
5158 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5159 f->open_array_section(name);
5160 for (auto cpu : cpus) {
5161 f->dump_int("cpu", cpu);
5162 }
5163 f->close_section();
5164 }
5165
5166 void OSDMonitor::dump_info(Formatter *f)
5167 {
5168 f->open_object_section("osdmap");
5169 osdmap.dump(f);
5170 f->close_section();
5171
5172 f->open_array_section("osd_metadata");
5173 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5174 if (osdmap.exists(i)) {
5175 f->open_object_section("osd");
5176 f->dump_unsigned("id", i);
5177 dump_osd_metadata(i, f, NULL);
5178 f->close_section();
5179 }
5180 }
5181 f->close_section();
5182
5183 f->dump_unsigned("osdmap_first_committed", get_first_committed());
5184 f->dump_unsigned("osdmap_last_committed", get_last_committed());
5185
5186 f->open_object_section("crushmap");
5187 osdmap.crush->dump(f);
5188 f->close_section();
5189
5190 if (has_osdmap_manifest) {
5191 f->open_object_section("osdmap_manifest");
5192 osdmap_manifest.dump(f);
5193 f->close_section();
5194 }
5195 }
5196
namespace {
  // Every property that `osd pool get` can report; used below to map the
  // user-supplied variable name to a typed choice.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS };

  // Return the elements of `first` that do not appear in `second`
  // (plain set difference).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> remaining(first);
    for (const auto& choice : second) {
      remaining.erase(choice);
    }
    return remaining;
  }
}
5230
5231
5232 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5233 {
5234 op->mark_osdmon_event(__func__);
5235 auto m = op->get_req<MMonCommand>();
5236 int r = 0;
5237 bufferlist rdata;
5238 stringstream ss, ds;
5239
5240 cmdmap_t cmdmap;
5241 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5242 string rs = ss.str();
5243 mon->reply_command(op, -EINVAL, rs, get_last_committed());
5244 return true;
5245 }
5246
5247 MonSession *session = op->get_session();
5248 if (!session) {
5249 derr << __func__ << " no session" << dendl;
5250 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
5251 return true;
5252 }
5253
5254 string prefix;
5255 cmd_getval(cmdmap, "prefix", prefix);
5256
5257 string format;
5258 cmd_getval(cmdmap, "format", format, string("plain"));
5259 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5260
5261 if (prefix == "osd stat") {
5262 if (f) {
5263 f->open_object_section("osdmap");
5264 osdmap.print_summary(f.get(), ds, "", true);
5265 f->close_section();
5266 f->flush(rdata);
5267 } else {
5268 osdmap.print_summary(nullptr, ds, "", true);
5269 rdata.append(ds);
5270 }
5271 }
5272 else if (prefix == "osd dump" ||
5273 prefix == "osd tree" ||
5274 prefix == "osd tree-from" ||
5275 prefix == "osd ls" ||
5276 prefix == "osd getmap" ||
5277 prefix == "osd getcrushmap" ||
5278 prefix == "osd ls-tree" ||
5279 prefix == "osd info") {
5280 string val;
5281
5282 epoch_t epoch = 0;
5283 int64_t epochnum;
5284 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)osdmap.get_epoch());
5285 epoch = epochnum;
5286
5287 bufferlist osdmap_bl;
5288 int err = get_version_full(epoch, osdmap_bl);
5289 if (err == -ENOENT) {
5290 r = -ENOENT;
5291 ss << "there is no map for epoch " << epoch;
5292 goto reply;
5293 }
5294 ceph_assert(err == 0);
5295 ceph_assert(osdmap_bl.length());
5296
5297 OSDMap *p;
5298 if (epoch == osdmap.get_epoch()) {
5299 p = &osdmap;
5300 } else {
5301 p = new OSDMap;
5302 p->decode(osdmap_bl);
5303 }
5304
5305 auto sg = make_scope_guard([&] {
5306 if (p != &osdmap) {
5307 delete p;
5308 }
5309 });
5310
5311 if (prefix == "osd dump") {
5312 stringstream ds;
5313 if (f) {
5314 f->open_object_section("osdmap");
5315 p->dump(f.get());
5316 f->close_section();
5317 f->flush(ds);
5318 } else {
5319 p->print(ds);
5320 }
5321 rdata.append(ds);
5322 if (!f)
5323 ds << " ";
5324 } else if (prefix == "osd ls") {
5325 if (f) {
5326 f->open_array_section("osds");
5327 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5328 if (osdmap.exists(i)) {
5329 f->dump_int("osd", i);
5330 }
5331 }
5332 f->close_section();
5333 f->flush(ds);
5334 } else {
5335 bool first = true;
5336 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5337 if (osdmap.exists(i)) {
5338 if (!first)
5339 ds << "\n";
5340 first = false;
5341 ds << i;
5342 }
5343 }
5344 }
5345 rdata.append(ds);
5346 } else if (prefix == "osd info") {
5347 int64_t osd_id;
5348 bool do_single_osd = true;
5349 if (!cmd_getval(cmdmap, "id", osd_id)) {
5350 do_single_osd = false;
5351 }
5352
5353 if (do_single_osd && !osdmap.exists(osd_id)) {
5354 ss << "osd." << osd_id << " does not exist";
5355 r = -EINVAL;
5356 goto reply;
5357 }
5358
5359 if (f) {
5360 if (do_single_osd) {
5361 osdmap.dump_osd(osd_id, f.get());
5362 } else {
5363 osdmap.dump_osds(f.get());
5364 }
5365 f->flush(ds);
5366 } else {
5367 if (do_single_osd) {
5368 osdmap.print_osd(osd_id, ds);
5369 } else {
5370 osdmap.print_osds(ds);
5371 }
5372 }
5373 rdata.append(ds);
5374 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5375 string bucket;
5376 if (prefix == "osd tree-from") {
5377 cmd_getval(cmdmap, "bucket", bucket);
5378 if (!osdmap.crush->name_exists(bucket)) {
5379 ss << "bucket '" << bucket << "' does not exist";
5380 r = -ENOENT;
5381 goto reply;
5382 }
5383 int id = osdmap.crush->get_item_id(bucket);
5384 if (id >= 0) {
5385 ss << "\"" << bucket << "\" is not a bucket";
5386 r = -EINVAL;
5387 goto reply;
5388 }
5389 }
5390
5391 vector<string> states;
5392 cmd_getval(cmdmap, "states", states);
5393 unsigned filter = 0;
5394 for (auto& s : states) {
5395 if (s == "up") {
5396 filter |= OSDMap::DUMP_UP;
5397 } else if (s == "down") {
5398 filter |= OSDMap::DUMP_DOWN;
5399 } else if (s == "in") {
5400 filter |= OSDMap::DUMP_IN;
5401 } else if (s == "out") {
5402 filter |= OSDMap::DUMP_OUT;
5403 } else if (s == "destroyed") {
5404 filter |= OSDMap::DUMP_DESTROYED;
5405 } else {
5406 ss << "unrecognized state '" << s << "'";
5407 r = -EINVAL;
5408 goto reply;
5409 }
5410 }
5411 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5412 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5413 ss << "cannot specify both 'in' and 'out'";
5414 r = -EINVAL;
5415 goto reply;
5416 }
5417 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5418 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5419 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5420 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5421 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5422 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5423 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5424 r = -EINVAL;
5425 goto reply;
5426 }
5427 if (f) {
5428 f->open_object_section("tree");
5429 p->print_tree(f.get(), NULL, filter, bucket);
5430 f->close_section();
5431 f->flush(ds);
5432 } else {
5433 p->print_tree(NULL, &ds, filter, bucket);
5434 }
5435 rdata.append(ds);
5436 } else if (prefix == "osd getmap") {
5437 rdata.append(osdmap_bl);
5438 ss << "got osdmap epoch " << p->get_epoch();
5439 } else if (prefix == "osd getcrushmap") {
5440 p->crush->encode(rdata, mon->get_quorum_con_features());
5441 ss << p->get_crush_version();
5442 } else if (prefix == "osd ls-tree") {
5443 string bucket_name;
5444 cmd_getval(cmdmap, "name", bucket_name);
5445 set<int> osds;
5446 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5447 if (r == -ENOENT) {
5448 ss << "\"" << bucket_name << "\" does not exist";
5449 goto reply;
5450 } else if (r < 0) {
5451 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5452 goto reply;
5453 }
5454
5455 if (f) {
5456 f->open_array_section("osds");
5457 for (auto &i : osds) {
5458 if (osdmap.exists(i)) {
5459 f->dump_int("osd", i);
5460 }
5461 }
5462 f->close_section();
5463 f->flush(ds);
5464 } else {
5465 bool first = true;
5466 for (auto &i : osds) {
5467 if (osdmap.exists(i)) {
5468 if (!first)
5469 ds << "\n";
5470 first = false;
5471 ds << i;
5472 }
5473 }
5474 }
5475
5476 rdata.append(ds);
5477 }
5478 } else if (prefix == "osd getmaxosd") {
5479 if (f) {
5480 f->open_object_section("getmaxosd");
5481 f->dump_unsigned("epoch", osdmap.get_epoch());
5482 f->dump_int("max_osd", osdmap.get_max_osd());
5483 f->close_section();
5484 f->flush(rdata);
5485 } else {
5486 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5487 rdata.append(ds);
5488 }
5489 } else if (prefix == "osd utilization") {
5490 string out;
5491 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5492 if (f)
5493 f->flush(rdata);
5494 else
5495 rdata.append(out);
5496 r = 0;
5497 goto reply;
5498 } else if (prefix == "osd find") {
5499 int64_t osd;
5500 if (!cmd_getval(cmdmap, "id", osd)) {
5501 ss << "unable to parse osd id value '"
5502 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5503 r = -EINVAL;
5504 goto reply;
5505 }
5506 if (!osdmap.exists(osd)) {
5507 ss << "osd." << osd << " does not exist";
5508 r = -ENOENT;
5509 goto reply;
5510 }
5511 string format;
5512 cmd_getval(cmdmap, "format", format);
5513 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5514 f->open_object_section("osd_location");
5515 f->dump_int("osd", osd);
5516 f->dump_object("addrs", osdmap.get_addrs(osd));
5517 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5518
5519 // try to identify host, pod/container name, etc.
5520 map<string,string> m;
5521 load_metadata(osd, m, nullptr);
5522 if (auto p = m.find("hostname"); p != m.end()) {
5523 f->dump_string("host", p->second);
5524 }
5525 for (auto& k : {
5526 "pod_name", "pod_namespace", // set by rook
5527 "container_name" // set by cephadm, ceph-ansible
5528 }) {
5529 if (auto p = m.find(k); p != m.end()) {
5530 f->dump_string(k, p->second);
5531 }
5532 }
5533
5534 // crush is helpful too
5535 f->open_object_section("crush_location");
5536 map<string,string> loc = osdmap.crush->get_full_location(osd);
5537 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5538 f->dump_string(p->first.c_str(), p->second);
5539 f->close_section();
5540 f->close_section();
5541 f->flush(rdata);
5542 } else if (prefix == "osd metadata") {
5543 int64_t osd = -1;
5544 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5545 !cmd_getval(cmdmap, "id", osd)) {
5546 ss << "unable to parse osd id value '"
5547 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5548 r = -EINVAL;
5549 goto reply;
5550 }
5551 if (osd >= 0 && !osdmap.exists(osd)) {
5552 ss << "osd." << osd << " does not exist";
5553 r = -ENOENT;
5554 goto reply;
5555 }
5556 string format;
5557 cmd_getval(cmdmap, "format", format);
5558 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5559 if (osd >= 0) {
5560 f->open_object_section("osd_metadata");
5561 f->dump_unsigned("id", osd);
5562 r = dump_osd_metadata(osd, f.get(), &ss);
5563 if (r < 0)
5564 goto reply;
5565 f->close_section();
5566 } else {
5567 r = 0;
5568 f->open_array_section("osd_metadata");
5569 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5570 if (osdmap.exists(i)) {
5571 f->open_object_section("osd");
5572 f->dump_unsigned("id", i);
5573 r = dump_osd_metadata(i, f.get(), NULL);
5574 if (r == -EINVAL || r == -ENOENT) {
5575 // Drop error, continue to get other daemons' metadata
5576 dout(4) << "No metadata for osd." << i << dendl;
5577 r = 0;
5578 } else if (r < 0) {
5579 // Unexpected error
5580 goto reply;
5581 }
5582 f->close_section();
5583 }
5584 }
5585 f->close_section();
5586 }
5587 f->flush(rdata);
5588 } else if (prefix == "osd versions") {
5589 if (!f)
5590 f.reset(Formatter::create("json-pretty"));
5591 count_metadata("ceph_version", f.get());
5592 f->flush(rdata);
5593 r = 0;
5594 } else if (prefix == "osd count-metadata") {
5595 if (!f)
5596 f.reset(Formatter::create("json-pretty"));
5597 string field;
5598 cmd_getval(cmdmap, "property", field);
5599 count_metadata(field, f.get());
5600 f->flush(rdata);
5601 r = 0;
5602 } else if (prefix == "osd numa-status") {
5603 TextTable tbl;
5604 if (f) {
5605 f->open_array_section("osds");
5606 } else {
5607 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5608 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5609 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5610 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5611 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5612 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5613 }
5614 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5615 if (osdmap.exists(i)) {
5616 map<string,string> m;
5617 ostringstream err;
5618 if (load_metadata(i, m, &err) < 0) {
5619 continue;
5620 }
5621 string host;
5622 auto p = m.find("hostname");
5623 if (p != m.end()) {
5624 host = p->second;
5625 }
5626 if (f) {
5627 f->open_object_section("osd");
5628 f->dump_int("osd", i);
5629 f->dump_string("host", host);
5630 for (auto n : { "network_numa_node", "objectstore_numa_node",
5631 "numa_node" }) {
5632 p = m.find(n);
5633 if (p != m.end()) {
5634 f->dump_int(n, atoi(p->second.c_str()));
5635 }
5636 }
5637 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5638 p = m.find(n);
5639 if (p != m.end()) {
5640 list<string> ls = get_str_list(p->second, ",");
5641 f->open_array_section(n);
5642 for (auto node : ls) {
5643 f->dump_int("node", atoi(node.c_str()));
5644 }
5645 f->close_section();
5646 }
5647 }
5648 for (auto n : { "numa_node_cpus" }) {
5649 p = m.find(n);
5650 if (p != m.end()) {
5651 dump_cpu_list(f.get(), n, p->second);
5652 }
5653 }
5654 f->close_section();
5655 } else {
5656 tbl << i;
5657 tbl << host;
5658 p = m.find("network_numa_nodes");
5659 if (p != m.end()) {
5660 tbl << p->second;
5661 } else {
5662 tbl << "-";
5663 }
5664 p = m.find("objectstore_numa_nodes");
5665 if (p != m.end()) {
5666 tbl << p->second;
5667 } else {
5668 tbl << "-";
5669 }
5670 p = m.find("numa_node");
5671 auto q = m.find("numa_node_cpus");
5672 if (p != m.end() && q != m.end()) {
5673 tbl << p->second;
5674 tbl << q->second;
5675 } else {
5676 tbl << "-";
5677 tbl << "-";
5678 }
5679 tbl << TextTable::endrow;
5680 }
5681 }
5682 }
5683 if (f) {
5684 f->close_section();
5685 f->flush(rdata);
5686 } else {
5687 rdata.append(stringify(tbl));
5688 }
5689 } else if (prefix == "osd map") {
5690 string poolstr, objstr, namespacestr;
5691 cmd_getval(cmdmap, "pool", poolstr);
5692 cmd_getval(cmdmap, "object", objstr);
5693 cmd_getval(cmdmap, "nspace", namespacestr);
5694
5695 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5696 if (pool < 0) {
5697 ss << "pool " << poolstr << " does not exist";
5698 r = -ENOENT;
5699 goto reply;
5700 }
5701 object_locator_t oloc(pool, namespacestr);
5702 object_t oid(objstr);
5703 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5704 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5705 vector<int> up, acting;
5706 int up_p, acting_p;
5707 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5708
5709 string fullobjname;
5710 if (!namespacestr.empty())
5711 fullobjname = namespacestr + string("/") + oid.name;
5712 else
5713 fullobjname = oid.name;
5714 if (f) {
5715 f->open_object_section("osd_map");
5716 f->dump_unsigned("epoch", osdmap.get_epoch());
5717 f->dump_string("pool", poolstr);
5718 f->dump_int("pool_id", pool);
5719 f->dump_stream("objname") << fullobjname;
5720 f->dump_stream("raw_pgid") << pgid;
5721 f->dump_stream("pgid") << mpgid;
5722 f->open_array_section("up");
5723 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5724 f->dump_int("osd", *p);
5725 f->close_section();
5726 f->dump_int("up_primary", up_p);
5727 f->open_array_section("acting");
5728 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5729 f->dump_int("osd", *p);
5730 f->close_section();
5731 f->dump_int("acting_primary", acting_p);
5732 f->close_section(); // osd_map
5733 f->flush(rdata);
5734 } else {
5735 ds << "osdmap e" << osdmap.get_epoch()
5736 << " pool '" << poolstr << "' (" << pool << ")"
5737 << " object '" << fullobjname << "' ->"
5738 << " pg " << pgid << " (" << mpgid << ")"
5739 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5740 << pg_vector_string(acting) << ", p" << acting_p << ")";
5741 rdata.append(ds);
5742 }
5743
5744 } else if (prefix == "pg map") {
5745 pg_t pgid;
5746 string pgidstr;
5747 cmd_getval(cmdmap, "pgid", pgidstr);
5748 if (!pgid.parse(pgidstr.c_str())) {
5749 ss << "invalid pgid '" << pgidstr << "'";
5750 r = -EINVAL;
5751 goto reply;
5752 }
5753 vector<int> up, acting;
5754 if (!osdmap.have_pg_pool(pgid.pool())) {
5755 ss << "pg '" << pgidstr << "' does not exist";
5756 r = -ENOENT;
5757 goto reply;
5758 }
5759 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5760 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5761 if (f) {
5762 f->open_object_section("pg_map");
5763 f->dump_unsigned("epoch", osdmap.get_epoch());
5764 f->dump_stream("raw_pgid") << pgid;
5765 f->dump_stream("pgid") << mpgid;
5766 f->open_array_section("up");
5767 for (auto osd : up) {
5768 f->dump_int("up_osd", osd);
5769 }
5770 f->close_section();
5771 f->open_array_section("acting");
5772 for (auto osd : acting) {
5773 f->dump_int("acting_osd", osd);
5774 }
5775 f->close_section();
5776 f->close_section();
5777 f->flush(rdata);
5778 } else {
5779 ds << "osdmap e" << osdmap.get_epoch()
5780 << " pg " << pgid << " (" << mpgid << ")"
5781 << " -> up " << up << " acting " << acting;
5782 rdata.append(ds);
5783 }
5784 goto reply;
5785
5786 } else if (prefix == "osd lspools") {
5787 if (f)
5788 f->open_array_section("pools");
5789 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5790 p != osdmap.pools.end();
5791 ++p) {
5792 if (f) {
5793 f->open_object_section("pool");
5794 f->dump_int("poolnum", p->first);
5795 f->dump_string("poolname", osdmap.pool_name[p->first]);
5796 f->close_section();
5797 } else {
5798 ds << p->first << ' ' << osdmap.pool_name[p->first];
5799 if (next(p) != osdmap.pools.end()) {
5800 ds << '\n';
5801 }
5802 }
5803 }
5804 if (f) {
5805 f->close_section();
5806 f->flush(ds);
5807 }
5808 rdata.append(ds);
5809 } else if (prefix == "osd blacklist ls") {
5810 if (f)
5811 f->open_array_section("blacklist");
5812
5813 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blacklist.begin();
5814 p != osdmap.blacklist.end();
5815 ++p) {
5816 if (f) {
5817 f->open_object_section("entry");
5818 f->dump_string("addr", p->first.get_legacy_str());
5819 f->dump_stream("until") << p->second;
5820 f->close_section();
5821 } else {
5822 stringstream ss;
5823 string s;
5824 ss << p->first << " " << p->second;
5825 getline(ss, s);
5826 s += "\n";
5827 rdata.append(s);
5828 }
5829 }
5830 if (f) {
5831 f->close_section();
5832 f->flush(rdata);
5833 }
5834 ss << "listed " << osdmap.blacklist.size() << " entries";
5835
5836 } else if (prefix == "osd pool ls") {
5837 string detail;
5838 cmd_getval(cmdmap, "detail", detail);
5839 if (!f && detail == "detail") {
5840 ostringstream ss;
5841 osdmap.print_pools(ss);
5842 rdata.append(ss.str());
5843 } else {
5844 if (f)
5845 f->open_array_section("pools");
5846 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
5847 it != osdmap.get_pools().end();
5848 ++it) {
5849 if (f) {
5850 if (detail == "detail") {
5851 f->open_object_section("pool");
5852 f->dump_int("pool_id", it->first);
5853 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5854 it->second.dump(f.get());
5855 f->close_section();
5856 } else {
5857 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
5858 }
5859 } else {
5860 rdata.append(osdmap.get_pool_name(it->first) + "\n");
5861 }
5862 }
5863 if (f) {
5864 f->close_section();
5865 f->flush(rdata);
5866 }
5867 }
5868
5869 } else if (prefix == "osd crush get-tunable") {
5870 string tunable;
5871 cmd_getval(cmdmap, "tunable", tunable);
5872 ostringstream rss;
5873 if (f)
5874 f->open_object_section("tunable");
5875 if (tunable == "straw_calc_version") {
5876 if (f)
5877 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
5878 else
5879 rss << osdmap.crush->get_straw_calc_version() << "\n";
5880 } else {
5881 r = -EINVAL;
5882 goto reply;
5883 }
5884 if (f) {
5885 f->close_section();
5886 f->flush(rdata);
5887 } else {
5888 rdata.append(rss.str());
5889 }
5890 r = 0;
5891
5892 } else if (prefix == "osd pool get") {
5893 string poolstr;
5894 cmd_getval(cmdmap, "pool", poolstr);
5895 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5896 if (pool < 0) {
5897 ss << "unrecognized pool '" << poolstr << "'";
5898 r = -ENOENT;
5899 goto reply;
5900 }
5901
5902 const pg_pool_t *p = osdmap.get_pg_pool(pool);
5903 string var;
5904 cmd_getval(cmdmap, "var", var);
5905
5906 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
5907 const choices_map_t ALL_CHOICES = {
5908 {"size", SIZE},
5909 {"min_size", MIN_SIZE},
5910 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
5911 {"crush_rule", CRUSH_RULE}, {"hashpspool", HASHPSPOOL},
5912 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
5913 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
5914 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
5915 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
5916 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
5917 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
5918 {"use_gmt_hitset", USE_GMT_HITSET},
5919 {"target_max_objects", TARGET_MAX_OBJECTS},
5920 {"target_max_bytes", TARGET_MAX_BYTES},
5921 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
5922 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
5923 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
5924 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
5925 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
5926 {"erasure_code_profile", ERASURE_CODE_PROFILE},
5927 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
5928 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
5929 {"fast_read", FAST_READ},
5930 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
5931 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
5932 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
5933 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
5934 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
5935 {"recovery_priority", RECOVERY_PRIORITY},
5936 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
5937 {"scrub_priority", SCRUB_PRIORITY},
5938 {"compression_mode", COMPRESSION_MODE},
5939 {"compression_algorithm", COMPRESSION_ALGORITHM},
5940 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
5941 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
5942 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
5943 {"csum_type", CSUM_TYPE},
5944 {"csum_max_block", CSUM_MAX_BLOCK},
5945 {"csum_min_block", CSUM_MIN_BLOCK},
5946 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
5947 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
5948 {"pg_num_min", PG_NUM_MIN},
5949 {"target_size_bytes", TARGET_SIZE_BYTES},
5950 {"target_size_ratio", TARGET_SIZE_RATIO},
5951 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
5952 };
5953
5954 typedef std::set<osd_pool_get_choices> choices_set_t;
5955
5956 const choices_set_t ONLY_TIER_CHOICES = {
5957 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
5958 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
5959 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
5960 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
5961 MIN_READ_RECENCY_FOR_PROMOTE,
5962 MIN_WRITE_RECENCY_FOR_PROMOTE,
5963 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
5964 };
5965 const choices_set_t ONLY_ERASURE_CHOICES = {
5966 EC_OVERWRITES, ERASURE_CODE_PROFILE
5967 };
5968
5969 choices_set_t selected_choices;
5970 if (var == "all") {
5971 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
5972 it != ALL_CHOICES.end(); ++it) {
5973 selected_choices.insert(it->second);
5974 }
5975
5976 if(!p->is_tier()) {
5977 selected_choices = subtract_second_from_first(selected_choices,
5978 ONLY_TIER_CHOICES);
5979 }
5980
5981 if(!p->is_erasure()) {
5982 selected_choices = subtract_second_from_first(selected_choices,
5983 ONLY_ERASURE_CHOICES);
5984 }
5985 } else /* var != "all" */ {
5986 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
5987 osd_pool_get_choices selected = found->second;
5988
5989 if (!p->is_tier() &&
5990 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
5991 ss << "pool '" << poolstr
5992 << "' is not a tier pool: variable not applicable";
5993 r = -EACCES;
5994 goto reply;
5995 }
5996
5997 if (!p->is_erasure() &&
5998 ONLY_ERASURE_CHOICES.find(selected)
5999 != ONLY_ERASURE_CHOICES.end()) {
6000 ss << "pool '" << poolstr
6001 << "' is not a erasure pool: variable not applicable";
6002 r = -EACCES;
6003 goto reply;
6004 }
6005
6006 if (pool_opts_t::is_opt_name(var) &&
6007 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6008 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6009 r = -ENOENT;
6010 goto reply;
6011 }
6012
6013 selected_choices.insert(selected);
6014 }
6015
6016 if (f) {
6017 f->open_object_section("pool");
6018 f->dump_string("pool", poolstr);
6019 f->dump_int("pool_id", pool);
6020 for(choices_set_t::const_iterator it = selected_choices.begin();
6021 it != selected_choices.end(); ++it) {
6022 choices_map_t::const_iterator i;
6023 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6024 if (i->second == *it) {
6025 break;
6026 }
6027 }
6028 ceph_assert(i != ALL_CHOICES.end());
6029 switch(*it) {
6030 case PG_NUM:
6031 f->dump_int("pg_num", p->get_pg_num());
6032 break;
6033 case PGP_NUM:
6034 f->dump_int("pgp_num", p->get_pgp_num());
6035 break;
6036 case SIZE:
6037 f->dump_int("size", p->get_size());
6038 break;
6039 case MIN_SIZE:
6040 f->dump_int("min_size", p->get_min_size());
6041 break;
6042 case CRUSH_RULE:
6043 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6044 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6045 p->get_crush_rule()));
6046 } else {
6047 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6048 }
6049 break;
6050 case EC_OVERWRITES:
6051 f->dump_bool("allow_ec_overwrites",
6052 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6053 break;
6054 case PG_AUTOSCALE_MODE:
6055 f->dump_string("pg_autoscale_mode",
6056 pg_pool_t::get_pg_autoscale_mode_name(
6057 p->pg_autoscale_mode));
6058 break;
6059 case HASHPSPOOL:
6060 case NODELETE:
6061 case NOPGCHANGE:
6062 case NOSIZECHANGE:
6063 case WRITE_FADVISE_DONTNEED:
6064 case NOSCRUB:
6065 case NODEEP_SCRUB:
6066 f->dump_bool(i->first.c_str(),
6067 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6068 break;
6069 case HIT_SET_PERIOD:
6070 f->dump_int("hit_set_period", p->hit_set_period);
6071 break;
6072 case HIT_SET_COUNT:
6073 f->dump_int("hit_set_count", p->hit_set_count);
6074 break;
6075 case HIT_SET_TYPE:
6076 f->dump_string("hit_set_type",
6077 HitSet::get_type_name(p->hit_set_params.get_type()));
6078 break;
6079 case HIT_SET_FPP:
6080 {
6081 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6082 BloomHitSet::Params *bloomp =
6083 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6084 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6085 } else if(var != "all") {
6086 f->close_section();
6087 ss << "hit set is not of type Bloom; " <<
6088 "invalid to get a false positive rate!";
6089 r = -EINVAL;
6090 goto reply;
6091 }
6092 }
6093 break;
6094 case USE_GMT_HITSET:
6095 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6096 break;
6097 case TARGET_MAX_OBJECTS:
6098 f->dump_unsigned("target_max_objects", p->target_max_objects);
6099 break;
6100 case TARGET_MAX_BYTES:
6101 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6102 break;
6103 case CACHE_TARGET_DIRTY_RATIO:
6104 f->dump_unsigned("cache_target_dirty_ratio_micro",
6105 p->cache_target_dirty_ratio_micro);
6106 f->dump_float("cache_target_dirty_ratio",
6107 ((float)p->cache_target_dirty_ratio_micro/1000000));
6108 break;
6109 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6110 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6111 p->cache_target_dirty_high_ratio_micro);
6112 f->dump_float("cache_target_dirty_high_ratio",
6113 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6114 break;
6115 case CACHE_TARGET_FULL_RATIO:
6116 f->dump_unsigned("cache_target_full_ratio_micro",
6117 p->cache_target_full_ratio_micro);
6118 f->dump_float("cache_target_full_ratio",
6119 ((float)p->cache_target_full_ratio_micro/1000000));
6120 break;
6121 case CACHE_MIN_FLUSH_AGE:
6122 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6123 break;
6124 case CACHE_MIN_EVICT_AGE:
6125 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6126 break;
6127 case ERASURE_CODE_PROFILE:
6128 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6129 break;
6130 case MIN_READ_RECENCY_FOR_PROMOTE:
6131 f->dump_int("min_read_recency_for_promote",
6132 p->min_read_recency_for_promote);
6133 break;
6134 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6135 f->dump_int("min_write_recency_for_promote",
6136 p->min_write_recency_for_promote);
6137 break;
6138 case FAST_READ:
6139 f->dump_int("fast_read", p->fast_read);
6140 break;
6141 case HIT_SET_GRADE_DECAY_RATE:
6142 f->dump_int("hit_set_grade_decay_rate",
6143 p->hit_set_grade_decay_rate);
6144 break;
6145 case HIT_SET_SEARCH_LAST_N:
6146 f->dump_int("hit_set_search_last_n",
6147 p->hit_set_search_last_n);
6148 break;
6149 case SCRUB_MIN_INTERVAL:
6150 case SCRUB_MAX_INTERVAL:
6151 case DEEP_SCRUB_INTERVAL:
6152 case RECOVERY_PRIORITY:
6153 case RECOVERY_OP_PRIORITY:
6154 case SCRUB_PRIORITY:
6155 case COMPRESSION_MODE:
6156 case COMPRESSION_ALGORITHM:
6157 case COMPRESSION_REQUIRED_RATIO:
6158 case COMPRESSION_MAX_BLOB_SIZE:
6159 case COMPRESSION_MIN_BLOB_SIZE:
6160 case CSUM_TYPE:
6161 case CSUM_MAX_BLOCK:
6162 case CSUM_MIN_BLOCK:
6163 case FINGERPRINT_ALGORITHM:
6164 case PG_NUM_MIN:
6165 case TARGET_SIZE_BYTES:
6166 case TARGET_SIZE_RATIO:
6167 case PG_AUTOSCALE_BIAS:
6168 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6169 if (p->opts.is_set(key)) {
6170 if(*it == CSUM_TYPE) {
6171 int64_t val;
6172 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6173 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6174 } else {
6175 p->opts.dump(i->first, f.get());
6176 }
6177 }
6178 break;
6179 }
6180 }
6181 f->close_section();
6182 f->flush(rdata);
6183 } else /* !f */ {
6184 for(choices_set_t::const_iterator it = selected_choices.begin();
6185 it != selected_choices.end(); ++it) {
6186 choices_map_t::const_iterator i;
6187 switch(*it) {
6188 case PG_NUM:
6189 ss << "pg_num: " << p->get_pg_num() << "\n";
6190 break;
6191 case PGP_NUM:
6192 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6193 break;
6194 case SIZE:
6195 ss << "size: " << p->get_size() << "\n";
6196 break;
6197 case MIN_SIZE:
6198 ss << "min_size: " << p->get_min_size() << "\n";
6199 break;
6200 case CRUSH_RULE:
6201 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6202 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6203 p->get_crush_rule()) << "\n";
6204 } else {
6205 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6206 }
6207 break;
6208 case PG_AUTOSCALE_MODE:
6209 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6210 p->pg_autoscale_mode) <<"\n";
6211 break;
6212 case HIT_SET_PERIOD:
6213 ss << "hit_set_period: " << p->hit_set_period << "\n";
6214 break;
6215 case HIT_SET_COUNT:
6216 ss << "hit_set_count: " << p->hit_set_count << "\n";
6217 break;
6218 case HIT_SET_TYPE:
6219 ss << "hit_set_type: " <<
6220 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6221 break;
6222 case HIT_SET_FPP:
6223 {
6224 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6225 BloomHitSet::Params *bloomp =
6226 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6227 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6228 } else if(var != "all") {
6229 ss << "hit set is not of type Bloom; " <<
6230 "invalid to get a false positive rate!";
6231 r = -EINVAL;
6232 goto reply;
6233 }
6234 }
6235 break;
6236 case USE_GMT_HITSET:
6237 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6238 break;
6239 case TARGET_MAX_OBJECTS:
6240 ss << "target_max_objects: " << p->target_max_objects << "\n";
6241 break;
6242 case TARGET_MAX_BYTES:
6243 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6244 break;
6245 case CACHE_TARGET_DIRTY_RATIO:
6246 ss << "cache_target_dirty_ratio: "
6247 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6248 break;
6249 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6250 ss << "cache_target_dirty_high_ratio: "
6251 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6252 break;
6253 case CACHE_TARGET_FULL_RATIO:
6254 ss << "cache_target_full_ratio: "
6255 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6256 break;
6257 case CACHE_MIN_FLUSH_AGE:
6258 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6259 break;
6260 case CACHE_MIN_EVICT_AGE:
6261 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6262 break;
6263 case ERASURE_CODE_PROFILE:
6264 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6265 break;
6266 case MIN_READ_RECENCY_FOR_PROMOTE:
6267 ss << "min_read_recency_for_promote: " <<
6268 p->min_read_recency_for_promote << "\n";
6269 break;
6270 case HIT_SET_GRADE_DECAY_RATE:
6271 ss << "hit_set_grade_decay_rate: " <<
6272 p->hit_set_grade_decay_rate << "\n";
6273 break;
6274 case HIT_SET_SEARCH_LAST_N:
6275 ss << "hit_set_search_last_n: " <<
6276 p->hit_set_search_last_n << "\n";
6277 break;
6278 case EC_OVERWRITES:
6279 ss << "allow_ec_overwrites: " <<
6280 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6281 "\n";
6282 break;
6283 case HASHPSPOOL:
6284 case NODELETE:
6285 case NOPGCHANGE:
6286 case NOSIZECHANGE:
6287 case WRITE_FADVISE_DONTNEED:
6288 case NOSCRUB:
6289 case NODEEP_SCRUB:
6290 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6291 if (i->second == *it)
6292 break;
6293 }
6294 ceph_assert(i != ALL_CHOICES.end());
6295 ss << i->first << ": " <<
6296 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6297 "true" : "false") << "\n";
6298 break;
6299 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6300 ss << "min_write_recency_for_promote: " <<
6301 p->min_write_recency_for_promote << "\n";
6302 break;
6303 case FAST_READ:
6304 ss << "fast_read: " << p->fast_read << "\n";
6305 break;
6306 case SCRUB_MIN_INTERVAL:
6307 case SCRUB_MAX_INTERVAL:
6308 case DEEP_SCRUB_INTERVAL:
6309 case RECOVERY_PRIORITY:
6310 case RECOVERY_OP_PRIORITY:
6311 case SCRUB_PRIORITY:
6312 case COMPRESSION_MODE:
6313 case COMPRESSION_ALGORITHM:
6314 case COMPRESSION_REQUIRED_RATIO:
6315 case COMPRESSION_MAX_BLOB_SIZE:
6316 case COMPRESSION_MIN_BLOB_SIZE:
6317 case CSUM_TYPE:
6318 case CSUM_MAX_BLOCK:
6319 case CSUM_MIN_BLOCK:
6320 case FINGERPRINT_ALGORITHM:
6321 case PG_NUM_MIN:
6322 case TARGET_SIZE_BYTES:
6323 case TARGET_SIZE_RATIO:
6324 case PG_AUTOSCALE_BIAS:
6325 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6326 if (i->second == *it)
6327 break;
6328 }
6329 ceph_assert(i != ALL_CHOICES.end());
6330 {
6331 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6332 if (p->opts.is_set(key)) {
6333 if(key == pool_opts_t::CSUM_TYPE) {
6334 int64_t val;
6335 p->opts.get(key, &val);
6336 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6337 } else {
6338 ss << i->first << ": " << p->opts.get(key) << "\n";
6339 }
6340 }
6341 }
6342 break;
6343 }
6344 rdata.append(ss.str());
6345 ss.str("");
6346 }
6347 }
6348 r = 0;
6349 } else if (prefix == "osd pool get-quota") {
6350 string pool_name;
6351 cmd_getval(cmdmap, "pool", pool_name);
6352
6353 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6354 if (poolid < 0) {
6355 ceph_assert(poolid == -ENOENT);
6356 ss << "unrecognized pool '" << pool_name << "'";
6357 r = -ENOENT;
6358 goto reply;
6359 }
6360 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6361 const pool_stat_t* pstat = mon->mgrstatmon()->get_pool_stat(poolid);
6362 const object_stat_sum_t& sum = pstat->stats.sum;
6363 if (f) {
6364 f->open_object_section("pool_quotas");
6365 f->dump_string("pool_name", pool_name);
6366 f->dump_unsigned("pool_id", poolid);
6367 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6368 f->dump_int("current_num_objects", sum.num_objects);
6369 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6370 f->dump_int("current_num_bytes", sum.num_bytes);
6371 f->close_section();
6372 f->flush(rdata);
6373 } else {
6374 stringstream rs;
6375 rs << "quotas for pool '" << pool_name << "':\n"
6376 << " max objects: ";
6377 if (p->quota_max_objects == 0)
6378 rs << "N/A";
6379 else {
6380 rs << si_u_t(p->quota_max_objects) << " objects";
6381 rs << " (current num objects: " << sum.num_objects << " objects)";
6382 }
6383 rs << "\n"
6384 << " max bytes : ";
6385 if (p->quota_max_bytes == 0)
6386 rs << "N/A";
6387 else {
6388 rs << byte_u_t(p->quota_max_bytes);
6389 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6390 }
6391 rdata.append(rs.str());
6392 }
6393 rdata.append("\n");
6394 r = 0;
6395 } else if (prefix == "osd crush rule list" ||
6396 prefix == "osd crush rule ls") {
6397 if (f) {
6398 f->open_array_section("rules");
6399 osdmap.crush->list_rules(f.get());
6400 f->close_section();
6401 f->flush(rdata);
6402 } else {
6403 ostringstream ss;
6404 osdmap.crush->list_rules(&ss);
6405 rdata.append(ss.str());
6406 }
6407 } else if (prefix == "osd crush rule ls-by-class") {
6408 string class_name;
6409 cmd_getval(cmdmap, "class", class_name);
6410 if (class_name.empty()) {
6411 ss << "no class specified";
6412 r = -EINVAL;
6413 goto reply;
6414 }
6415 set<int> rules;
6416 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6417 if (r < 0) {
6418 ss << "failed to get rules by class '" << class_name << "'";
6419 goto reply;
6420 }
6421 if (f) {
6422 f->open_array_section("rules");
6423 for (auto &rule: rules) {
6424 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6425 }
6426 f->close_section();
6427 f->flush(rdata);
6428 } else {
6429 ostringstream rs;
6430 for (auto &rule: rules) {
6431 rs << osdmap.crush->get_rule_name(rule) << "\n";
6432 }
6433 rdata.append(rs.str());
6434 }
6435 } else if (prefix == "osd crush rule dump") {
6436 string name;
6437 cmd_getval(cmdmap, "name", name);
6438 string format;
6439 cmd_getval(cmdmap, "format", format);
6440 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6441 if (name == "") {
6442 f->open_array_section("rules");
6443 osdmap.crush->dump_rules(f.get());
6444 f->close_section();
6445 } else {
6446 int ruleno = osdmap.crush->get_rule_id(name);
6447 if (ruleno < 0) {
6448 ss << "unknown crush rule '" << name << "'";
6449 r = ruleno;
6450 goto reply;
6451 }
6452 osdmap.crush->dump_rule(ruleno, f.get());
6453 }
6454 ostringstream rs;
6455 f->flush(rs);
6456 rs << "\n";
6457 rdata.append(rs.str());
6458 } else if (prefix == "osd crush dump") {
6459 string format;
6460 cmd_getval(cmdmap, "format", format);
6461 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6462 f->open_object_section("crush_map");
6463 osdmap.crush->dump(f.get());
6464 f->close_section();
6465 ostringstream rs;
6466 f->flush(rs);
6467 rs << "\n";
6468 rdata.append(rs.str());
6469 } else if (prefix == "osd crush show-tunables") {
6470 string format;
6471 cmd_getval(cmdmap, "format", format);
6472 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6473 f->open_object_section("crush_map_tunables");
6474 osdmap.crush->dump_tunables(f.get());
6475 f->close_section();
6476 ostringstream rs;
6477 f->flush(rs);
6478 rs << "\n";
6479 rdata.append(rs.str());
6480 } else if (prefix == "osd crush tree") {
6481 string shadow;
6482 cmd_getval(cmdmap, "shadow", shadow);
6483 bool show_shadow = shadow == "--show-shadow";
6484 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6485 if (f) {
6486 f->open_object_section("crush_tree");
6487 osdmap.crush->dump_tree(nullptr,
6488 f.get(),
6489 osdmap.get_pool_names(),
6490 show_shadow);
6491 f->close_section();
6492 f->flush(rdata);
6493 } else {
6494 ostringstream ss;
6495 osdmap.crush->dump_tree(&ss,
6496 nullptr,
6497 osdmap.get_pool_names(),
6498 show_shadow);
6499 rdata.append(ss.str());
6500 }
6501 } else if (prefix == "osd crush ls") {
6502 string name;
6503 if (!cmd_getval(cmdmap, "node", name)) {
6504 ss << "no node specified";
6505 r = -EINVAL;
6506 goto reply;
6507 }
6508 if (!osdmap.crush->name_exists(name)) {
6509 ss << "node '" << name << "' does not exist";
6510 r = -ENOENT;
6511 goto reply;
6512 }
6513 int id = osdmap.crush->get_item_id(name);
6514 list<int> result;
6515 if (id >= 0) {
6516 result.push_back(id);
6517 } else {
6518 int num = osdmap.crush->get_bucket_size(id);
6519 for (int i = 0; i < num; ++i) {
6520 result.push_back(osdmap.crush->get_bucket_item(id, i));
6521 }
6522 }
6523 if (f) {
6524 f->open_array_section("items");
6525 for (auto i : result) {
6526 f->dump_string("item", osdmap.crush->get_item_name(i));
6527 }
6528 f->close_section();
6529 f->flush(rdata);
6530 } else {
6531 ostringstream ss;
6532 for (auto i : result) {
6533 ss << osdmap.crush->get_item_name(i) << "\n";
6534 }
6535 rdata.append(ss.str());
6536 }
6537 r = 0;
6538 } else if (prefix == "osd crush class ls") {
6539 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6540 f->open_array_section("crush_classes");
6541 for (auto i : osdmap.crush->class_name)
6542 f->dump_string("class", i.second);
6543 f->close_section();
6544 f->flush(rdata);
6545 } else if (prefix == "osd crush class ls-osd") {
6546 string name;
6547 cmd_getval(cmdmap, "class", name);
6548 set<int> osds;
6549 osdmap.crush->get_devices_by_class(name, &osds);
6550 if (f) {
6551 f->open_array_section("osds");
6552 for (auto &osd: osds)
6553 f->dump_int("osd", osd);
6554 f->close_section();
6555 f->flush(rdata);
6556 } else {
6557 bool first = true;
6558 for (auto &osd : osds) {
6559 if (!first)
6560 ds << "\n";
6561 first = false;
6562 ds << osd;
6563 }
6564 rdata.append(ds);
6565 }
6566 } else if (prefix == "osd crush get-device-class") {
6567 vector<string> idvec;
6568 cmd_getval(cmdmap, "ids", idvec);
6569 map<int, string> class_by_osd;
6570 for (auto& id : idvec) {
6571 ostringstream ts;
6572 long osd = parse_osd_id(id.c_str(), &ts);
6573 if (osd < 0) {
6574 ss << "unable to parse osd id:'" << id << "'";
6575 r = -EINVAL;
6576 goto reply;
6577 }
6578 auto device_class = osdmap.crush->get_item_class(osd);
6579 if (device_class)
6580 class_by_osd[osd] = device_class;
6581 else
6582 class_by_osd[osd] = ""; // no class
6583 }
6584 if (f) {
6585 f->open_array_section("osd_device_classes");
6586 for (auto& i : class_by_osd) {
6587 f->open_object_section("osd_device_class");
6588 f->dump_int("osd", i.first);
6589 f->dump_string("device_class", i.second);
6590 f->close_section();
6591 }
6592 f->close_section();
6593 f->flush(rdata);
6594 } else {
6595 if (class_by_osd.size() == 1) {
6596 // for single input, make a clean output
6597 ds << class_by_osd.begin()->second;
6598 } else {
6599 // note that we do not group osds by class here
6600 for (auto it = class_by_osd.begin();
6601 it != class_by_osd.end();
6602 it++) {
6603 ds << "osd." << it->first << ' ' << it->second;
6604 if (next(it) != class_by_osd.end())
6605 ds << '\n';
6606 }
6607 }
6608 rdata.append(ds);
6609 }
6610 } else if (prefix == "osd erasure-code-profile ls") {
6611 const auto &profiles = osdmap.get_erasure_code_profiles();
6612 if (f)
6613 f->open_array_section("erasure-code-profiles");
6614 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6615 if (f)
6616 f->dump_string("profile", i->first.c_str());
6617 else
6618 rdata.append(i->first + "\n");
6619 }
6620 if (f) {
6621 f->close_section();
6622 ostringstream rs;
6623 f->flush(rs);
6624 rs << "\n";
6625 rdata.append(rs.str());
6626 }
6627 } else if (prefix == "osd crush weight-set ls") {
6628 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6629 if (f) {
6630 f->open_array_section("weight_sets");
6631 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6632 f->dump_string("pool", "(compat)");
6633 }
6634 for (auto& i : osdmap.crush->choose_args) {
6635 if (i.first >= 0) {
6636 f->dump_string("pool", osdmap.get_pool_name(i.first));
6637 }
6638 }
6639 f->close_section();
6640 f->flush(rdata);
6641 } else {
6642 ostringstream rs;
6643 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6644 rs << "(compat)\n";
6645 }
6646 for (auto& i : osdmap.crush->choose_args) {
6647 if (i.first >= 0) {
6648 rs << osdmap.get_pool_name(i.first) << "\n";
6649 }
6650 }
6651 rdata.append(rs.str());
6652 }
6653 } else if (prefix == "osd crush weight-set dump") {
6654 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6655 "json-pretty"));
6656 osdmap.crush->dump_choose_args(f.get());
6657 f->flush(rdata);
6658 } else if (prefix == "osd erasure-code-profile get") {
6659 string name;
6660 cmd_getval(cmdmap, "name", name);
6661 if (!osdmap.has_erasure_code_profile(name)) {
6662 ss << "unknown erasure code profile '" << name << "'";
6663 r = -ENOENT;
6664 goto reply;
6665 }
6666 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6667 if (f)
6668 f->open_object_section("profile");
6669 for (map<string,string>::const_iterator i = profile.begin();
6670 i != profile.end();
6671 ++i) {
6672 if (f)
6673 f->dump_string(i->first.c_str(), i->second.c_str());
6674 else
6675 rdata.append(i->first + "=" + i->second + "\n");
6676 }
6677 if (f) {
6678 f->close_section();
6679 ostringstream rs;
6680 f->flush(rs);
6681 rs << "\n";
6682 rdata.append(rs.str());
6683 }
6684 } else if (prefix == "osd pool application get") {
6685 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6686 "json-pretty"));
6687 string pool_name;
6688 cmd_getval(cmdmap, "pool", pool_name);
6689 string app;
6690 cmd_getval(cmdmap, "app", app);
6691 string key;
6692 cmd_getval(cmdmap, "key", key);
6693
6694 if (pool_name.empty()) {
6695 // all
6696 f->open_object_section("pools");
6697 for (const auto &pool : osdmap.pools) {
6698 std::string name("<unknown>");
6699 const auto &pni = osdmap.pool_name.find(pool.first);
6700 if (pni != osdmap.pool_name.end())
6701 name = pni->second;
6702 f->open_object_section(name.c_str());
6703 for (auto &app_pair : pool.second.application_metadata) {
6704 f->open_object_section(app_pair.first.c_str());
6705 for (auto &kv_pair : app_pair.second) {
6706 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6707 }
6708 f->close_section();
6709 }
6710 f->close_section(); // name
6711 }
6712 f->close_section(); // pools
6713 f->flush(rdata);
6714 } else {
6715 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6716 if (pool < 0) {
6717 ss << "unrecognized pool '" << pool_name << "'";
6718 r = -ENOENT;
6719 goto reply;
6720 }
6721 auto p = osdmap.get_pg_pool(pool);
6722 // filter by pool
6723 if (app.empty()) {
6724 f->open_object_section(pool_name.c_str());
6725 for (auto &app_pair : p->application_metadata) {
6726 f->open_object_section(app_pair.first.c_str());
6727 for (auto &kv_pair : app_pair.second) {
6728 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6729 }
6730 f->close_section(); // application
6731 }
6732 f->close_section(); // pool_name
6733 f->flush(rdata);
6734 goto reply;
6735 }
6736
6737 auto app_it = p->application_metadata.find(app);
6738 if (app_it == p->application_metadata.end()) {
6739 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6740 r = -ENOENT;
6741 goto reply;
6742 }
6743 // filter by pool + app
6744 if (key.empty()) {
6745 f->open_object_section(app_it->first.c_str());
6746 for (auto &kv_pair : app_it->second) {
6747 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6748 }
6749 f->close_section(); // application
6750 f->flush(rdata);
6751 goto reply;
6752 }
6753 // filter by pool + app + key
6754 auto key_it = app_it->second.find(key);
6755 if (key_it == app_it->second.end()) {
6756 ss << "application '" << app << "' on pool '" << pool_name
6757 << "' does not have key '" << key << "'";
6758 r = -ENOENT;
6759 goto reply;
6760 }
6761 ss << key_it->second << "\n";
6762 rdata.append(ss.str());
6763 ss.str("");
6764 }
6765 } else if (prefix == "osd get-require-min-compat-client") {
6766 ss << osdmap.require_min_compat_client << std::endl;
6767 rdata.append(ss.str());
6768 ss.str("");
6769 goto reply;
6770 } else if (prefix == "osd pool application enable" ||
6771 prefix == "osd pool application disable" ||
6772 prefix == "osd pool application set" ||
6773 prefix == "osd pool application rm") {
6774 bool changed = false;
6775 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
6776 if (r != 0) {
6777 // Error, reply.
6778 goto reply;
6779 } else if (changed) {
6780 // Valid mutation, proceed to prepare phase
6781 return false;
6782 } else {
6783 // Idempotent case, reply
6784 goto reply;
6785 }
6786 } else {
6787 // try prepare update
6788 return false;
6789 }
6790
6791 reply:
6792 string rs;
6793 getline(ss, rs);
6794 mon->reply_command(op, r, rs, rdata, get_last_committed());
6795 return true;
6796 }
6797
6798 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
6799 {
6800 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6801 osdmap.get_pg_pool(pool_id));
6802 ceph_assert(pool);
6803 pool->set_flag(flags);
6804 }
6805
6806 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
6807 {
6808 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
6809 osdmap.get_pg_pool(pool_id));
6810 ceph_assert(pool);
6811 pool->unset_flag(flags);
6812 }
6813
6814 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
6815 {
6816 char k[80];
6817 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
6818 return k;
6819 }
6820
6821 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
6822 {
6823 char k[80];
6824 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
6825 (unsigned long long)pool, (unsigned long long)snap);
6826 return k;
6827 }
6828
6829 string OSDMonitor::make_purged_snap_key_value(
6830 int64_t pool, snapid_t snap, snapid_t num,
6831 epoch_t epoch, bufferlist *v)
6832 {
6833 // encode the *last* epoch in the key so that we can use forward
6834 // iteration only to search for an epoch in an interval.
6835 encode(snap, *v);
6836 encode(snap + num, *v);
6837 encode(epoch, *v);
6838 return make_purged_snap_key(pool, snap + num - 1);
6839 }
6840
6841
// Look up the purged-snap interval containing 'snap' for the given pool.
// On success, returns 0 and sets *begin/*end to the half-open interval
// [begin, end); returns -ENOENT when no recorded interval covers 'snap'.
//
// Keys are "purged_snap_<pool>_<hex last-snap-of-interval>" (see
// make_purged_snap_key_value), so lower_bound on the key built from
// 'snap' lands on the only record whose interval could contain it.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon->store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // No key at or after ours: nothing recorded this far out.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // We ran off the end of the purged_snap_* keyspace into some other
    // record type stored under the same prefix.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - key '" << k << "' got '" << it->key()
             << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // Parse the pool id back out of the key we actually landed on.
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // Landed on the next pool's records; no interval for 'snap' in ours.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - key '" << k << "' got '" << gotk
             << "', wrong pool " << keypool
             << dendl;
    return -ENOENT;
  }
  // The value encodes [begin, end) for the interval this key terminates.
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // An interval exists for this pool, but it does not cover 'snap'.
    dout(20) << __func__
             << " pool " << pool << " snap " << snap
             << " - found [" << *begin << "," << *end << "), no overlap"
             << dendl;
    return -ENOENT;
  }
  return 0;
}
6891
// Record the newly purged interval [start, end) for 'pool' in transaction
// t, coalescing it with any adjacent intervals already on disk so the
// purged-snap keyspace stays a set of maximal disjoint ranges.
//
// Adjacency is detected by probing for intervals containing start-1
// (immediately before) and end (immediately after); four merge cases
// follow from which probes hit.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // b == 0: an existing interval ends exactly where ours starts.
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // a == 0: an existing interval begins exactly where ours ends.
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // Bridges two existing intervals: merge all three into one record
    // spanning [before_begin, after_end).
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // Extends an earlier interval forward: replace it with
    // [before_begin, end).  The old key (keyed by its last snap) must be
    // erased because the merged record gets a new, larger key.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // Extends a later interval backward: the merged interval
    // [start, after_end) keeps the same last snap, hence the same key,
    // so simply overwrite the existing record in place.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // No neighbors: write a fresh stand-alone record for [start, end).
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
6947
// Move snaps that the mgr reports as purged by all PGs into
// pending_inc.new_purged_snaps (at most mon_max_snap_prune_per_epoch per
// epoch), so they can be dropped from removed_snaps_queue.  Returns true
// if anything was queued for pruning in this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  // need a fresh digest from the mgr stat monitor
  if (!mon->mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;  // 0 means "no limit"; use a large batch instead
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon->mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break;  // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;  // hit the per-epoch budget
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7026
// Compare each pool's usage (from the mgr stat digest) against its quota
// and set or clear the FULL_QUOTA/FULL flags accordingly.  Returns true
// if any pool's flags were changed (i.e. a new map should be proposed).
bool OSDMonitor::update_pools_status()
{
  // quota decisions need fresh pool stats
  if (!mon->mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon->mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;  // no stats yet for this pool
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is "full" when it hits either its byte or object quota
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked full-by-quota; clear the flags once usage drops
      if (pool_is_full)
        continue;

      mon->clog->info() << "pool '" << pool_name
                       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
                       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not yet marked; mark it if a quota has been exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
          (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_bytes: "
                         << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
        mon->clog->warn() << "pool '" << pool_name << "' is full"
                         << " (reached quota's max_objects: "
                         << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7087
7088 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7089 {
7090 op->mark_osdmon_event(__func__);
7091 auto m = op->get_req<MPoolOp>();
7092 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7093 MonSession *session = op->get_session();
7094 if (!session)
7095 return -EPERM;
7096 string erasure_code_profile;
7097 stringstream ss;
7098 string rule_name;
7099 int ret = 0;
7100 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7101 0, 0, 0, 0, 0, 0.0,
7102 erasure_code_profile,
7103 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {},
7104 &ss);
7105
7106 if (ret < 0) {
7107 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7108 }
7109 return ret;
7110 }
7111
7112 int OSDMonitor::crush_rename_bucket(const string& srcname,
7113 const string& dstname,
7114 ostream *ss)
7115 {
7116 int ret;
7117 //
7118 // Avoid creating a pending crush if it does not already exists and
7119 // the rename would fail.
7120 //
7121 if (!_have_pending_crush()) {
7122 ret = _get_stable_crush().can_rename_bucket(srcname,
7123 dstname,
7124 ss);
7125 if (ret)
7126 return ret;
7127 }
7128
7129 CrushWrapper newcrush;
7130 _get_pending_crush(newcrush);
7131
7132 ret = newcrush.rename_bucket(srcname,
7133 dstname,
7134 ss);
7135 if (ret)
7136 return ret;
7137
7138 pending_inc.crush.clear();
7139 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7140 *ss << "renamed bucket " << srcname << " into " << dstname;
7141 return 0;
7142 }
7143
7144 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7145 {
7146 string replacement = "";
7147
7148 if (plugin == "jerasure_generic" ||
7149 plugin == "jerasure_sse3" ||
7150 plugin == "jerasure_sse4" ||
7151 plugin == "jerasure_neon") {
7152 replacement = "jerasure";
7153 } else if (plugin == "shec_generic" ||
7154 plugin == "shec_sse3" ||
7155 plugin == "shec_sse4" ||
7156 plugin == "shec_neon") {
7157 replacement = "shec";
7158 }
7159
7160 if (replacement != "") {
7161 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7162 << plugin << " that has been deprecated. Please use "
7163 << replacement << " instead." << dendl;
7164 }
7165 }
7166
7167 int OSDMonitor::normalize_profile(const string& profilename,
7168 ErasureCodeProfile &profile,
7169 bool force,
7170 ostream *ss)
7171 {
7172 ErasureCodeInterfaceRef erasure_code;
7173 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7174 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7175 check_legacy_ec_plugin(plugin->second, profilename);
7176 int err = instance.factory(plugin->second,
7177 g_conf().get_val<std::string>("erasure_code_dir"),
7178 profile, &erasure_code, ss);
7179 if (err) {
7180 return err;
7181 }
7182
7183 err = erasure_code->init(profile, ss);
7184 if (err) {
7185 return err;
7186 }
7187
7188 auto it = profile.find("stripe_unit");
7189 if (it != profile.end()) {
7190 string err_str;
7191 uint32_t stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7192 if (!err_str.empty()) {
7193 *ss << "could not parse stripe_unit '" << it->second
7194 << "': " << err_str << std::endl;
7195 return -EINVAL;
7196 }
7197 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7198 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7199 if (chunk_size != stripe_unit) {
7200 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7201 << "alignment. Would be padded to " << chunk_size
7202 << std::endl;
7203 return -EINVAL;
7204 }
7205 if ((stripe_unit % 4096) != 0 && !force) {
7206 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7207 << "use --force to override this check" << std::endl;
7208 return -EINVAL;
7209 }
7210 }
7211 return 0;
7212 }
7213
7214 int OSDMonitor::crush_rule_create_erasure(const string &name,
7215 const string &profile,
7216 int *rule,
7217 ostream *ss)
7218 {
7219 int ruleid = osdmap.crush->get_rule_id(name);
7220 if (ruleid != -ENOENT) {
7221 *rule = osdmap.crush->get_rule_mask_ruleset(ruleid);
7222 return -EEXIST;
7223 }
7224
7225 CrushWrapper newcrush;
7226 _get_pending_crush(newcrush);
7227
7228 ruleid = newcrush.get_rule_id(name);
7229 if (ruleid != -ENOENT) {
7230 *rule = newcrush.get_rule_mask_ruleset(ruleid);
7231 return -EALREADY;
7232 } else {
7233 ErasureCodeInterfaceRef erasure_code;
7234 int err = get_erasure_code(profile, &erasure_code, ss);
7235 if (err) {
7236 *ss << "failed to load plugin using profile " << profile << std::endl;
7237 return err;
7238 }
7239
7240 err = erasure_code->create_rule(name, newcrush, ss);
7241 erasure_code.reset();
7242 if (err < 0)
7243 return err;
7244 *rule = err;
7245 pending_inc.crush.clear();
7246 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
7247 return 0;
7248 }
7249 }
7250
7251 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7252 ErasureCodeInterfaceRef *erasure_code,
7253 ostream *ss) const
7254 {
7255 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7256 return -EAGAIN;
7257 ErasureCodeProfile profile =
7258 osdmap.get_erasure_code_profile(erasure_code_profile);
7259 ErasureCodeProfile::const_iterator plugin =
7260 profile.find("plugin");
7261 if (plugin == profile.end()) {
7262 *ss << "cannot determine the erasure code plugin"
7263 << " because there is no 'plugin' entry in the erasure_code_profile "
7264 << profile << std::endl;
7265 return -EINVAL;
7266 }
7267 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7268 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7269 return instance.factory(plugin->second,
7270 g_conf().get_val<std::string>("erasure_code_dir"),
7271 profile, erasure_code, ss);
7272 }
7273
7274 int OSDMonitor::check_cluster_features(uint64_t features,
7275 stringstream &ss)
7276 {
7277 stringstream unsupported_ss;
7278 int unsupported_count = 0;
7279 if ((mon->get_quorum_con_features() & features) != features) {
7280 unsupported_ss << "the monitor cluster";
7281 ++unsupported_count;
7282 }
7283
7284 set<int32_t> up_osds;
7285 osdmap.get_up_osds(up_osds);
7286 for (set<int32_t>::iterator it = up_osds.begin();
7287 it != up_osds.end(); ++it) {
7288 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7289 if ((xi.features & features) != features) {
7290 if (unsupported_count > 0)
7291 unsupported_ss << ", ";
7292 unsupported_ss << "osd." << *it;
7293 unsupported_count ++;
7294 }
7295 }
7296
7297 if (unsupported_count > 0) {
7298 ss << "features " << features << " unsupported by: "
7299 << unsupported_ss.str();
7300 return -ENOTSUP;
7301 }
7302
7303 // check pending osd state, too!
7304 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7305 pending_inc.new_xinfo.begin();
7306 p != pending_inc.new_xinfo.end(); ++p) {
7307 const osd_xinfo_t &xi = p->second;
7308 if ((xi.features & features) != features) {
7309 dout(10) << __func__ << " pending osd." << p->first
7310 << " features are insufficient; retry" << dendl;
7311 return -EAGAIN;
7312 }
7313 }
7314
7315 return 0;
7316 }
7317
7318 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7319 stringstream& ss)
7320 {
7321 OSDMap::Incremental new_pending = pending_inc;
7322 encode(*newcrush, new_pending.crush, mon->get_quorum_con_features());
7323 OSDMap newmap;
7324 newmap.deepish_copy_from(osdmap);
7325 newmap.apply_incremental(new_pending);
7326
7327 // client compat
7328 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7329 auto mv = newmap.get_min_compat_client();
7330 if (mv > newmap.require_min_compat_client) {
7331 ss << "new crush map requires client version " << mv
7332 << " but require_min_compat_client is "
7333 << newmap.require_min_compat_client;
7334 return false;
7335 }
7336 }
7337
7338 // osd compat
7339 uint64_t features =
7340 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7341 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7342 stringstream features_ss;
7343 int r = check_cluster_features(features, features_ss);
7344 if (r) {
7345 ss << "Could not change CRUSH: " << features_ss.str();
7346 return false;
7347 }
7348
7349 return true;
7350 }
7351
7352 bool OSDMonitor::erasure_code_profile_in_use(
7353 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7354 const string &profile,
7355 ostream *ss)
7356 {
7357 bool found = false;
7358 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7359 p != pools.end();
7360 ++p) {
7361 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7362 *ss << osdmap.pool_name[p->first] << " ";
7363 found = true;
7364 }
7365 }
7366 if (found) {
7367 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7368 }
7369 return found;
7370 }
7371
7372 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7373 map<string,string> *erasure_code_profile_map,
7374 ostream *ss)
7375 {
7376 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7377 get_json_str_map,
7378 *ss,
7379 erasure_code_profile_map,
7380 true);
7381 if (r)
7382 return r;
7383 ceph_assert((*erasure_code_profile_map).count("plugin"));
7384 string default_plugin = (*erasure_code_profile_map)["plugin"];
7385 map<string,string> user_map;
7386 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7387 i != erasure_code_profile.end();
7388 ++i) {
7389 size_t equal = i->find('=');
7390 if (equal == string::npos) {
7391 user_map[*i] = string();
7392 (*erasure_code_profile_map)[*i] = string();
7393 } else {
7394 const string key = i->substr(0, equal);
7395 equal++;
7396 const string value = i->substr(equal);
7397 if (key.find("ruleset-") == 0) {
7398 *ss << "property '" << key << "' is no longer supported; try "
7399 << "'crush-" << key.substr(8) << "' instead";
7400 return -EINVAL;
7401 }
7402 user_map[key] = value;
7403 (*erasure_code_profile_map)[key] = value;
7404 }
7405 }
7406
7407 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7408 (*erasure_code_profile_map) = user_map;
7409
7410 return 0;
7411 }
7412
7413 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7414 const string &erasure_code_profile,
7415 uint8_t repl_size,
7416 unsigned *size, unsigned *min_size,
7417 ostream *ss)
7418 {
7419 int err = 0;
7420 switch (pool_type) {
7421 case pg_pool_t::TYPE_REPLICATED:
7422 if (repl_size == 0) {
7423 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7424 }
7425 *size = repl_size;
7426 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7427 break;
7428 case pg_pool_t::TYPE_ERASURE:
7429 {
7430 ErasureCodeInterfaceRef erasure_code;
7431 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7432 if (err == 0) {
7433 *size = erasure_code->get_chunk_count();
7434 *min_size =
7435 erasure_code->get_data_chunk_count() +
7436 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7437 assert(*min_size <= *size);
7438 assert(*min_size >= erasure_code->get_data_chunk_count());
7439 }
7440 }
7441 break;
7442 default:
7443 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7444 err = -EINVAL;
7445 break;
7446 }
7447 return err;
7448 }
7449
7450 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
7451 const string &erasure_code_profile,
7452 uint32_t *stripe_width,
7453 ostream *ss)
7454 {
7455 int err = 0;
7456 switch (pool_type) {
7457 case pg_pool_t::TYPE_REPLICATED:
7458 // ignored
7459 break;
7460 case pg_pool_t::TYPE_ERASURE:
7461 {
7462 ErasureCodeProfile profile =
7463 osdmap.get_erasure_code_profile(erasure_code_profile);
7464 ErasureCodeInterfaceRef erasure_code;
7465 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7466 if (err)
7467 break;
7468 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7469 uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7470 auto it = profile.find("stripe_unit");
7471 if (it != profile.end()) {
7472 string err_str;
7473 stripe_unit = strict_iecstrtoll(it->second.c_str(), &err_str);
7474 ceph_assert(err_str.empty());
7475 }
7476 *stripe_width = data_chunks *
7477 erasure_code->get_chunk_size(stripe_unit * data_chunks);
7478 }
7479 break;
7480 default:
7481 *ss << "prepare_pool_stripe_width: "
7482 << pool_type << " is not a known pool type";
7483 err = -EINVAL;
7484 break;
7485 }
7486 return err;
7487 }
7488
// Resolve the crush rule for a new pool.  If *crush_rule >= 0 it is only
// validated; otherwise it is looked up by name (replicated) or created
// from the erasure code profile (erasure).  May return -EAGAIN when the
// rule exists only in the pending map and the caller must retry after
// the proposal commits.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  // Use default rule
	  *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_ruleset(cct);
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable right now
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
      break;
    }
  } else {
    // caller supplied an explicit rule id; just validate it
    if (!osdmap.crush->ruleset_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7550
7551 int OSDMonitor::get_crush_rule(const string &rule_name,
7552 int *crush_rule,
7553 ostream *ss)
7554 {
7555 int ret;
7556 ret = osdmap.crush->get_rule_id(rule_name);
7557 if (ret != -ENOENT) {
7558 // found it, use it
7559 *crush_rule = ret;
7560 } else {
7561 CrushWrapper newcrush;
7562 _get_pending_crush(newcrush);
7563
7564 ret = newcrush.get_rule_id(rule_name);
7565 if (ret != -ENOENT) {
7566 // found it, wait for it to be proposed
7567 dout(20) << __func__ << ": rule " << rule_name
7568 << " try again" << dendl;
7569 return -EAGAIN;
7570 } else {
7571 // Cannot find it , return error
7572 *ss << "specified rule " << rule_name << " doesn't exist";
7573 return ret;
7574 }
7575 }
7576 return 0;
7577 }
7578
7579 int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, ostream *ss)
7580 {
7581 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7582 auto num_osds = std::max(osdmap.get_num_in_osds(), 3u); // assume min cluster size 3
7583 auto max_pgs = max_pgs_per_osd * num_osds;
7584 uint64_t projected = 0;
7585 if (pool < 0) {
7586 projected += pg_num * size;
7587 }
7588 for (const auto& i : osdmap.get_pools()) {
7589 if (i.first == pool) {
7590 projected += pg_num * size;
7591 } else {
7592 projected += i.second.get_pg_num_target() * i.second.get_size();
7593 }
7594 }
7595 if (projected > max_pgs) {
7596 if (pool >= 0) {
7597 *ss << "pool id " << pool;
7598 }
7599 *ss << " pg_num " << pg_num << " size " << size
7600 << " would mean " << projected
7601 << " total pgs, which exceeds max " << max_pgs
7602 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7603 << " * num_in_osds " << num_osds << ")";
7604 return -ERANGE;
7605 }
7606 return 0;
7607 }
7608
/**
 * Stage creation of a new pool in the pending osdmap increment.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rulset <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min If nonzero, stored as the PG_NUM_MIN pool option
 *        (nautilus+ clusters only)
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes If nonzero, stored as the TARGET_SIZE_BYTES pool
 *        option (nautilus+ clusters only)
 * @param target_size_ratio If >0, stored as the TARGET_SIZE_RATIO pool
 *        option (nautilus+ clusters only)
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode per-pool autoscale mode; unrecognized/empty
 *        values fall back to the configured default
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 ostream *ss)
{
  // --- validate arguments, filling in config defaults ---
  if (name.length() == 0)
    return -EINVAL;
  if (pg_num == 0)
    pg_num = g_conf().get_val<uint64_t>("osd_pool_default_pg_num");
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve and sanity-check the crush rule ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  if (g_conf()->mon_osd_crush_smoke_test) {
    // run the rule through a forked CrushTester to catch rules that
    // cannot produce valid mappings before committing the pool
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }

  // --- derive size/min_size and check PG budget ---
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  r = check_pg_num(-1, pg_num, size, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (!osdmap.crush->check_crush_rule(crush_rule, pool_type, size, *ss)) {
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // fast_read only applies to erasure pools
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
    case FAST_READ_OFF:
      fread = false;
      break;
    case FAST_READ_ON:
      fread = true;
      break;
    case FAST_READ_DEFAULT:
      fread = g_conf()->osd_pool_default_ec_fast_read;
      break;
    default:
      *ss << "invalid fast_read setting: " << fast_read;
      return -EINVAL;
    }
  }

  // creating the same pool name twice in one proposal is a no-op
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate the pool id and fill in the pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;

  // default autoscale mode comes from config; may be overridden below
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // start with at most mon_osd_max_initial_pgs; the autoscaler (or admin)
  // grows toward the pg_num target afterwards
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  // per-pool autoscale mode overrides the config default when recognized
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // cache-tier defaults (harmless for non-tier pools)
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
7825
7826 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
7827 {
7828 op->mark_osdmon_event(__func__);
7829 ostringstream ss;
7830 if (pending_inc.new_flags < 0)
7831 pending_inc.new_flags = osdmap.get_flags();
7832 pending_inc.new_flags |= flag;
7833 ss << OSDMap::get_flag_string(flag) << " is set";
7834 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7835 get_last_committed() + 1));
7836 return true;
7837 }
7838
7839 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
7840 {
7841 op->mark_osdmon_event(__func__);
7842 ostringstream ss;
7843 if (pending_inc.new_flags < 0)
7844 pending_inc.new_flags = osdmap.get_flags();
7845 pending_inc.new_flags &= ~flag;
7846 ss << OSDMap::get_flag_string(flag) << " is unset";
7847 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
7848 get_last_committed() + 1));
7849 return true;
7850 }
7851
/**
 * Handle "ceph osd pool set <pool> <var> <val>".
 *
 * Validates <var>/<val> against the pool's current state (including any
 * change already staged in pending_inc.new_pools), applies the change to
 * a local copy of the pg_pool_t, and on success stages that copy in
 * pending_inc.new_pools.
 *
 * @param cmdmap parsed command args: "pool", "var", "val", and for some
 *               vars "yes_i_really_mean_it"
 * @param ss     human-readable status or error message for the caller
 * @return 0 on success (including idempotent no-ops), negative errno on
 *         any validation failure
 */
int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
                                         stringstream& ss)
{
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    return -ENOENT;
  }
  string var;
  cmd_getval(cmdmap, "var", var);

  // Work on a copy; prefer a pending (not yet committed) version of the
  // pool if one is already staged in this proposal.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (pending_inc.new_pools.count(pool))
    p = pending_inc.new_pools[pool];

  // accept val as a json string in the normal case (current
  // generation monitor). parse out int or float values from the
  // string as needed. however, if it is not a string, try to pull
  // out an int, in case an older monitor with an older json schema is
  // forwarding a request.
  string val;
  string interr, floaterr;   // parse-error text; empty means the parse succeeded
  int64_t n = 0;             // integer interpretation of val
  double f = 0;              // float interpretation of val
  int64_t uf = 0; // micro-f
  cmd_getval(cmdmap, "val", val);

  // Vars that accept SI suffixes (k=1000...) vs IEC suffixes (Ki=1024...);
  // everything else is parsed as both plain int and float.
  auto si_options = {
    "target_max_objects"
  };
  auto iec_options = {
    "target_max_bytes",
    "target_size_bytes",
    "compression_max_blob_size",
    "compression_min_blob_size",
    "csum_max_block",
    "csum_min_block",
  };
  if (count(begin(si_options), end(si_options), var)) {
    n = strict_si_cast<int64_t>(val.c_str(), &interr);
  } else if (count(begin(iec_options), end(iec_options), var)) {
    n = strict_iec_cast<int64_t>(val.c_str(), &interr);
  } else {
    // parse string as both int and float; different fields use different types.
    n = strict_strtoll(val.c_str(), 10, &interr);
    f = strict_strtod(val.c_str(), &floaterr);
    uf = llrintl(f * (double)1000000.0);
  }

  // These vars only make sense on a cache tier; reject them on base pools.
  if (!p.is_tier() &&
      (var == "hit_set_type" || var == "hit_set_period" ||
       var == "hit_set_count" || var == "hit_set_fpp" ||
       var == "target_max_objects" || var == "target_max_bytes" ||
       var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
       var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
       var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
       var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
       var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
    return -EACCES;
  }

  if (var == "size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (p.type == pg_pool_t::TYPE_ERASURE) {
      ss << "can not change the size of an erasure-coded pool";
      return -ENOTSUP;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0 || n > 10) {
      ss << "pool size must be between 1 and 10";
      return -EINVAL;
    }
    if (!osdmap.crush->check_crush_rule(p.get_crush_rule(), p.type, n, ss)) {
      return -EINVAL;
    }
    int r = check_pg_num(pool, p.get_pg_num(), n, &ss);
    if (r < 0) {
      return r;
    }
    p.size = n;
    // keep the invariant min_size <= size
    if (n < p.min_size)
      p.min_size = n;
  } else if (var == "min_size") {
    if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
      ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }

    if (p.type != pg_pool_t::TYPE_ERASURE) {
      if (n < 1 || n > p.size) {
	ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    } else {
      // For EC pools min_size may not drop below k (data chunks) or
      // reads would be impossible.
      ErasureCodeInterfaceRef erasure_code;
      int k;
      stringstream tmp;
      int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
      if (err == 0) {
	k = erasure_code->get_data_chunk_count();
      } else {
	ss << __func__ << " get_erasure_code failed: " << tmp.str();
	return err;
      }

      if (n < k || n > p.size) {
	ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
	return -EINVAL;
      }
    }
    p.min_size = n;
  } else if (var == "pg_num_actual") {
    // Directly adjust the actual pg_num (normally driven by the mgr).
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num()) {
      return 0;
    }
    if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
      ss << "cannot adjust pg_num while initial PGs are being created";
      return -EBUSY;
    }
    if (n > (int)p.get_pg_num()) {
      if (p.get_pg_num() != p.get_pg_num_pending()) {
	// force pre-nautilus clients to resend their ops, since they
	// don't understand pg_num_pending changes form a new interval
	p.last_force_op_resend_prenautilus = pending_inc.epoch;
      }
      p.set_pg_num(n);
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to adjust pg_num_pending";
	return -EPERM;
      }
      if (n < (int)p.get_pgp_num()) {
	ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
	return -EINVAL;
      }
      if (n < (int)p.get_pg_num() - 1) {
	ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
	   << ") - 1; only single pg decrease is currently supported";
	return -EINVAL;
      }
      p.set_pg_num_pending(n);
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_pending changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
    }
    // force pre-luminous clients to resend their ops, since they
    // don't understand that split PGs now form a new interval.
    p.last_force_op_resend_preluminous = pending_inc.epoch;
  } else if (var == "pg_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n == (int)p.get_pg_num_target()) {
      return 0;
    }
    if (n <= 0 || static_cast<uint64_t>(n) >
	          g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
      ss << "'pg_num' must be greater than 0 and less than or equal to "
         << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
         << " (you may adjust 'mon max pool pg num' for higher values)";
      return -ERANGE;
    }
    if (n > (int)p.get_pg_num_target()) {
      int r = check_pg_num(pool, n, p.get_size(), &ss);
      if (r) {
	return r;
      }
      bool force = false;
      cmd_getval(cmdmap, "yes_i_really_mean_it", force);
      if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
	ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling.  use --yes-i-really-mean-it to force.";
	return -EPERM;
      }
    } else {
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "nautilus OSDs are required to decrease pg_num";
	return -EPERM;
      }
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pg_num directly
      assert(n > (int)p.get_pg_num());
      // force pre-nautilus clients to resend their ops, since they
      // don't understand pg_num_target changes form a new interval
      p.last_force_op_resend_prenautilus = pending_inc.epoch;
      // force pre-luminous clients to resend their ops, since they
      // don't understand that split PGs now form a new interval.
      p.last_force_op_resend_preluminous = pending_inc.epoch;
      p.set_pg_num(n);
    } else {
      // set targets; mgr will adjust pg_num_actual and pgp_num later.
      // make pgp_num track pg_num if it already matches.  if it is set
      // differently, leave it different and let the user control it
      // manually.
      if (p.get_pg_num_target() == p.get_pgp_num_target()) {
	p.set_pgp_num_target(n);
      }
      p.set_pg_num_target(n);
    }
  } else if (var == "pgp_num_actual") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_pending()) {
      ss << "specified pgp_num " << n
	 << " > pg_num_pending " << p.get_pg_num_pending();
      return -EINVAL;
    }
    p.set_pgp_num(n);
  } else if (var == "pgp_num") {
    if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
      ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
      return -EPERM;
    }
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n <= 0) {
      ss << "specified pgp_num must > 0, but you set to " << n;
      return -EINVAL;
    }
    if (n > (int)p.get_pg_num_target()) {
      ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      // pre-nautilus osdmap format; increase pgp_num directly
      p.set_pgp_num(n);
    } else {
      p.set_pgp_num_target(n);
    }
  } else if (var == "pg_autoscale_mode") {
    auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
    if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
      ss << "specified invalid mode " << val;
      return -EINVAL;
    }
    if (osdmap.require_osd_release < ceph_release_t::nautilus) {
      ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
      return -EINVAL;
    }
    p.pg_autoscale_mode = m;
  } else if (var == "crush_rule") {
    int id = osdmap.crush->get_rule_id(val);
    if (id == -ENOENT) {
      ss << "crush rule " << val << " does not exist";
      return -ENOENT;
    }
    if (id < 0) {
      ss << cpp_strerror(id);
      return -ENOENT;
    }
    if (!osdmap.crush->check_crush_rule(id, p.get_type(), p.get_size(), ss)) {
      return -EINVAL;
    }
    p.crush_rule = id;
  } else if (var == "nodelete" || var == "nopgchange" ||
	     var == "nosizechange" || var == "write_fadvise_dontneed" ||
	     var == "noscrub" || var == "nodeep-scrub") {
    // simple boolean pool flags
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hashpspool") {
    uint64_t flag = pg_pool_t::get_flag_by_name(var);
    // toggling hashpspool remaps every PG, so require explicit confirmation
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "are you SURE?  this will remap all placement groups in this pool,"
	    " this triggers large data movement,"
	    " pass --yes-i-really-mean-it if you really do.";
      return -EPERM;
    }
    // make sure we only compare against 'n' if we didn't receive a string
    if (val == "true" || (interr.empty() && n == 1)) {
      p.set_flag(flag);
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.unset_flag(flag);
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "hit_set_type") {
    if (val == "none")
      p.hit_set_params = HitSet::Params();
    else {
      int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
      if (err)
	return err;
      if (val == "bloom") {
	BloomHitSet::Params *bsp = new BloomHitSet::Params;
	bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
	p.hit_set_params = HitSet::Params(bsp);
      } else if (val == "explicit_hash")
	p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
      else if (val == "explicit_object")
	p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
      else {
	ss << "unrecognized hit_set type '" << val << "'";
	return -EINVAL;
      }
    }
  } else if (var == "hit_set_period") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_period should be non-negative";
      return -EINVAL;
    }
    p.hit_set_period = n;
  } else if (var == "hit_set_count") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    } else if (n < 0) {
      ss << "hit_set_count should be non-negative";
      return -EINVAL;
    }
    p.hit_set_count = n;
  } else if (var == "hit_set_fpp") {
    if (floaterr.length()) {
      ss << "error parsing floating point value '" << val << "': " << floaterr;
      return -EINVAL;
    } else if (f < 0 || f > 1.0) {
      ss << "hit_set_fpp should be in the range 0..1";
      return -EINVAL;
    }
    if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
      ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
      return -EINVAL;
    }
    // safe: type checked as TYPE_BLOOM just above
    BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
    bloomp->set_fpp(f);
  } else if (var == "use_gmt_hitset") {
    // one-way switch: can only be enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.use_gmt_hitset = true;
    } else {
      ss << "expecting value 'true' or '1'";
      return -EINVAL;
    }
  } else if (var == "allow_ec_overwrites") {
    if (!p.is_erasure()) {
      ss << "ec overwrites can only be enabled for an erasure coded pool";
      return -EINVAL;
    }
    stringstream err;
    if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
	!is_pool_currently_all_bluestore(pool, p, &err)) {
      ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
      return -EINVAL;
    }
    // one-way switch: cannot be disabled once enabled
    if (val == "true" || (interr.empty() && n == 1)) {
      p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      ss << "ec overwrites cannot be disabled once enabled";
      return -EINVAL;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (var == "target_max_objects") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_objects = n;
  } else if (var == "target_max_bytes") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.target_max_bytes = n;
  } else if (var == "cache_target_dirty_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_ratio_micro = uf;
  } else if (var == "cache_target_dirty_high_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_dirty_high_ratio_micro = uf;
  } else if (var == "cache_target_full_ratio") {
    if (floaterr.length()) {
      ss << "error parsing float '" << val << "': " << floaterr;
      return -EINVAL;
    }
    if (f < 0 || f > 1.0) {
      ss << "value must be in the range 0..1";
      return -ERANGE;
    }
    p.cache_target_full_ratio_micro = uf;
  } else if (var == "cache_min_flush_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_flush_age = n;
  } else if (var == "cache_min_evict_age") {
    if (interr.length()) {
      ss << "error parsing int '" << val << "': " << interr;
      return -EINVAL;
    }
    p.cache_min_evict_age = n;
  } else if (var == "min_read_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_read_recency_for_promote = n;
  } else if (var == "hit_set_grade_decay_rate") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > 100 || n < 0) {
      ss << "value out of range,valid range is 0 - 100";
      return -EINVAL;
    }
    p.hit_set_grade_decay_rate = n;
  } else if (var == "hit_set_search_last_n") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    if (n > p.hit_set_count || n < 0) {
      ss << "value out of range,valid range is 0 - hit_set_count";
      return -EINVAL;
    }
    p.hit_set_search_last_n = n;
  } else if (var == "min_write_recency_for_promote") {
    if (interr.length()) {
      ss << "error parsing integer value '" << val << "': " << interr;
      return -EINVAL;
    }
    p.min_write_recency_for_promote = n;
  } else if (var == "fast_read") {
    if (p.is_replicated()) {
      ss << "fast read is not supported in replication pool";
      return -EINVAL;
    }
    if (val == "true" || (interr.empty() && n == 1)) {
      p.fast_read = true;
    } else if (val == "false" || (interr.empty() && n == 0)) {
      p.fast_read = false;
    } else {
      ss << "expecting value 'true', 'false', '0', or '1'";
      return -EINVAL;
    }
  } else if (pool_opts_t::is_opt_name(var)) {
    // Generic pool options: per-option validation first, then a typed
    // set/unset via pool_opts_t below.
    bool unset = val == "unset";
    if (var == "compression_mode") {
      if (!unset) {
	auto cmode = Compressor::get_comp_mode_type(val);
	if (!cmode) {
	  ss << "unrecognized compression mode '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_algorithm") {
      if (!unset) {
	auto alg = Compressor::get_comp_alg_type(val);
	if (!alg) {
	  ss << "unrecognized compression_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "compression_required_ratio") {
      if (floaterr.length()) {
	ss << "error parsing float value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f < 0 || f > 1) {
	ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
	return -EINVAL;
      }
    } else if (var == "csum_type") {
      auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
      if (t < 0 ) {
	ss << "unrecognized csum_type '" << val << "'";
	return -EINVAL;
      }
      //preserve csum_type numeric value
      n = t;
      interr.clear();
    } else if (var == "compression_max_blob_size" ||
	       var == "compression_min_blob_size" ||
	       var == "csum_max_block" ||
	       var == "csum_min_block") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
    } else if (var == "fingerprint_algorithm") {
      if (!unset) {
	auto alg = pg_pool_t::get_fingerprint_from_str(val);
	if (!alg) {
	  ss << "unrecognized fingerprint_algorithm '" << val << "'";
	  return -EINVAL;
	}
      }
    } else if (var == "target_size_bytes") {
      if (interr.length()) {
	ss << "error parsing unit value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (osdmap.require_osd_release < ceph_release_t::nautilus) {
	ss << "must set require_osd_release to nautilus or "
	   << "later before setting target_size_bytes";
	return -EINVAL;
      }
    } else if (var == "pg_num_min") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (n > (int)p.get_pg_num_target()) {
	ss << "specified pg_num_min " << n
	   << " > pg_num " << p.get_pg_num_target();
	return -EINVAL;
      }
    } else if (var == "recovery_priority") {
      if (interr.length()) {
	ss << "error parsing int value '" << val << "': " << interr;
	return -EINVAL;
      }
      if (!g_conf()->debug_allow_any_pool_priority) {
	if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
	  ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
	     << " and " << OSD_POOL_PRIORITY_MAX;
	  return -EINVAL;
	}
      }
    } else if (var == "pg_autoscale_bias") {
      // NOTE(review): unlike the float options above, floaterr is not
      // checked here, so a non-numeric val falls through with f == 0
      // and passes this range check -- confirm whether intentional.
      if (f < 0.0 || f > 1000.0) {
	ss << "pg_autoscale_bias must be between 0 and 1000";
	return -EINVAL;
      }
    }

    // Apply the (validated) value according to the option's declared type.
    pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
    switch (desc.type) {
    case pool_opts_t::STR:
      if (unset) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<std::string>(val));
      }
      break;
    case pool_opts_t::INT:
      if (interr.length()) {
	ss << "error parsing integer value '" << val << "': " << interr;
	return -EINVAL;
      }
      // 0 (and "unset", which leaves n == 0) clears the option
      if (n == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<int64_t>(n));
      }
      break;
    case pool_opts_t::DOUBLE:
      if (floaterr.length()) {
	ss << "error parsing floating point value '" << val << "': " << floaterr;
	return -EINVAL;
      }
      if (f == 0) {
	p.opts.unset(desc.key);
      } else {
	p.opts.set(desc.key, static_cast<double>(f));
      }
      break;
    default:
      ceph_assert(!"unknown type");
    }
  } else {
    ss << "unrecognized variable '" << var << "'";
    return -EINVAL;
  }
  if (val != "unset") {
    ss << "set pool " << pool << " " << var << " to " << val;
  } else {
    ss << "unset pool " << pool << " " << var;
  }
  // stage the modified pool in the pending incremental
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool] = p;
  return 0;
}
8498
8499 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8500 const cmdmap_t& cmdmap,
8501 stringstream& ss)
8502 {
8503 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8504 }
8505
8506 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8507 const cmdmap_t& cmdmap,
8508 stringstream& ss,
8509 bool *modified)
8510 {
8511 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8512 }
8513
8514
8515 /**
8516 * Common logic for preprocess and prepare phases of pool application
8517 * tag commands. In preprocess mode we're only detecting invalid
8518 * commands, and determining whether it was a modification or a no-op.
8519 * In prepare mode we're actually updating the pending state.
8520 */
/**
 * Shared implementation for "osd pool application {enable,disable,set,rm}".
 *
 * Dispatches on the command suffix of @p prefix.  In preparing mode the
 * modified pool is staged in pending_inc; otherwise only validation is
 * performed and *modified reports whether the command would change state.
 *
 * @param prefix    full command name; only its suffix is examined
 * @param cmdmap    parsed args: "pool", "app", "key", "value",
 *                  optionally "yes_i_really_mean_it"
 * @param ss        human-readable status/error message
 * @param modified  out (may be null): set true when the command is a
 *                  real modification (not reached on no-op early returns)
 * @param preparing true to update pending state, false to only validate
 * @return 0 on success or idempotent no-op, negative errno on error
 */
int OSDMonitor::_command_pool_application(const string &prefix,
					  const cmdmap_t& cmdmap,
					  stringstream& ss,
					  bool *modified,
					  bool preparing)
{
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);
  int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << pool_name << "'";
    return -ENOENT;
  }

  // Work on a copy; when preparing, prefer an already-staged pending
  // version of the pool so edits in one proposal compose.
  pg_pool_t p = *osdmap.get_pg_pool(pool);
  if (preparing) {
    if (pending_inc.new_pools.count(pool)) {
      p = pending_inc.new_pools[pool];
    }
  }

  string app;
  cmd_getval(cmdmap, "app", app);
  bool app_exists = (p.application_metadata.count(app) > 0);

  // "all" is reserved as a wildcard in cap matching, so it cannot be a
  // literal key or value.
  string key;
  cmd_getval(cmdmap, "key", key);
  if (key == "all") {
    ss << "key cannot be 'all'";
    return -EINVAL;
  }

  string value;
  cmd_getval(cmdmap, "value", value);
  if (value == "all") {
    ss << "value cannot be 'all'";
    return -EINVAL;
  }

  if (boost::algorithm::ends_with(prefix, "enable")) {
    if (app.empty()) {
      ss << "application name must be provided";
      return -EINVAL;
    }

    if (p.is_tier()) {
      ss << "application must be enabled on base tier";
      return -EINVAL;
    }

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!app_exists && !p.application_metadata.empty() && !force) {
      ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
         << "application; pass --yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
      ss << "too many enabled applications on pool '" << pool_name << "'; "
         << "max " << MAX_POOL_APPLICATIONS;
      return -EINVAL;
    }

    if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "application name '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    if (!app_exists) {
      p.application_metadata[app] = {};
    }
    ss << "enabled application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "disable")) {
    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    if (!force) {
      ss << "Are you SURE? Disabling an application within a pool might result "
         << "in loss of application functionality; pass "
         << "--yes-i-really-mean-it to proceed anyway";
      return -EPERM;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return 0; // idempotent
    }

    p.application_metadata.erase(app);
    ss << "disable application '" << app << "' on pool '" << pool_name << "'";

  } else if (boost::algorithm::ends_with(prefix, "set")) {
    if (p.is_tier()) {
      ss << "application metadata must be set on base tier";
      return -EINVAL;
    }

    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    // NOTE: shadows the outer 'key' read above; the "all" check has
    // already run on the same cmdmap value.
    string key;
    cmd_getval(cmdmap, "key", key);

    if (key.empty()) {
      ss << "key must be provided";
      return -EINVAL;
    }

    auto &app_keys = p.application_metadata[app];
    if (app_keys.count(key) == 0 &&
        app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
      ss << "too many keys set for application '" << app << "' on pool '"
         << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
      return -EINVAL;
    }

    if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "key '" << app << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    // NOTE: shadows the outer 'value' read above.
    string value;
    cmd_getval(cmdmap, "value", value);
    if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
      ss << "value '" << value << "' too long; max length "
         << MAX_POOL_APPLICATION_LENGTH;
      return -EINVAL;
    }

    p.application_metadata[app][key] = value;
    ss << "set application '" << app << "' key '" << key << "' to '"
       << value << "' on pool '" << pool_name << "'";
  } else if (boost::algorithm::ends_with(prefix, "rm")) {
    if (!app_exists) {
      ss << "application '" << app << "' is not enabled on pool '" << pool_name
         << "'";
      return -ENOENT;
    }

    string key;
    cmd_getval(cmdmap, "key", key);
    auto it = p.application_metadata[app].find(key);
    if (it == p.application_metadata[app].end()) {
      ss << "application '" << app << "' on pool '" << pool_name
         << "' does not have key '" << key << "'";
      return 0; // idempotent
    }

    p.application_metadata[app].erase(it);
    ss << "removed application '" << app << "' key '" << key << "' on pool '"
       << pool_name << "'";
  } else {
    // command table should never route an unknown suffix here
    ceph_abort();
  }

  if (preparing) {
    p.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool] = p;
  }

  // Because we fell through this far, we didn't hit no-op cases,
  // so pool was definitely modified
  if (modified != nullptr) {
    *modified = true;
  }

  return 0;
}
8698
8699 int OSDMonitor::_prepare_command_osd_crush_remove(
8700 CrushWrapper &newcrush,
8701 int32_t id,
8702 int32_t ancestor,
8703 bool has_ancestor,
8704 bool unlink_only)
8705 {
8706 int err = 0;
8707
8708 if (has_ancestor) {
8709 err = newcrush.remove_item_under(cct, id, ancestor,
8710 unlink_only);
8711 } else {
8712 err = newcrush.remove_item(cct, id, unlink_only);
8713 }
8714 return err;
8715 }
8716
// Commit an already-modified scratch crush map: encode it into the
// pending incremental, replacing any previously staged crush change.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
}
8722
8723 int OSDMonitor::prepare_command_osd_crush_remove(
8724 CrushWrapper &newcrush,
8725 int32_t id,
8726 int32_t ancestor,
8727 bool has_ancestor,
8728 bool unlink_only)
8729 {
8730 int err = _prepare_command_osd_crush_remove(
8731 newcrush, id, ancestor,
8732 has_ancestor, unlink_only);
8733
8734 if (err < 0)
8735 return err;
8736
8737 ceph_assert(err == 0);
8738 do_osd_crush_remove(newcrush);
8739
8740 return 0;
8741 }
8742
8743 int OSDMonitor::prepare_command_osd_remove(int32_t id)
8744 {
8745 if (osdmap.is_up(id)) {
8746 return -EBUSY;
8747 }
8748
8749 pending_inc.new_state[id] = osdmap.get_state(id);
8750 pending_inc.new_uuid[id] = uuid_d();
8751 pending_metadata_rm.insert(id);
8752 pending_metadata.erase(id);
8753
8754 return 0;
8755 }
8756
8757 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
8758 {
8759 ceph_assert(existing_id);
8760 *existing_id = -1;
8761
8762 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
8763 if (!osdmap.exists(i) &&
8764 pending_inc.new_up_client.count(i) == 0 &&
8765 (pending_inc.new_state.count(i) == 0 ||
8766 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
8767 *existing_id = i;
8768 return -1;
8769 }
8770 }
8771
8772 if (pending_inc.new_max_osd < 0) {
8773 return osdmap.get_max_osd();
8774 }
8775 return pending_inc.new_max_osd;
8776 }
8777
/**
 * Stage creation of an osd in the pending incremental.
 *
 * Resolves the id to use (an explicit @p id, the id already bound to
 * @p uuid, or a freshly allocated one), optionally records the device
 * class in the crush map, and marks the id EXISTS|NEW.  Idempotent with
 * respect to replays: re-creating an osd whose uuid already maps to an
 * id simply reuses that id.
 *
 * @param id           requested osd id, or negative to auto-pick
 * @param uuid         osd uuid; zero uuid skips uuid-based lookup/binding
 * @param device_class if non-empty, crush device class to assign
 * @param new_id       out: the id that was chosen (never null)
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already bound to an id -- reuse it (replay/idempotency)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // recycling a previously destroyed id: exactly one of
    // existing_id/allocated_id is valid (see _allocate_osd_id)
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    pending_inc.new_weight[existing_id] = CEPH_OSD_OUT;
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the device class assignment in a scratch crush map
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // an explicitly requested id may exceed both committed and pending max
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_state[*new_id] |= CEPH_OSD_EXISTS | CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
8866
8867 int OSDMonitor::validate_osd_create(
8868 const int32_t id,
8869 const uuid_d& uuid,
8870 const bool check_osd_exists,
8871 int32_t* existing_id,
8872 stringstream& ss)
8873 {
8874
8875 dout(10) << __func__ << " id " << id << " uuid " << uuid
8876 << " check_osd_exists " << check_osd_exists << dendl;
8877
8878 ceph_assert(existing_id);
8879
8880 if (id < 0 && uuid.is_zero()) {
8881 // we have nothing to validate
8882 *existing_id = -1;
8883 return 0;
8884 } else if (uuid.is_zero()) {
8885 // we have an id but we will ignore it - because that's what
8886 // `osd create` does.
8887 return 0;
8888 }
8889
8890 /*
8891 * This function will be used to validate whether we are able to
8892 * create a new osd when the `uuid` is specified.
8893 *
8894 * It will be used by both `osd create` and `osd new`, as the checks
8895 * are basically the same when it pertains to osd id and uuid validation.
8896 * However, `osd create` presumes an `uuid` is optional, for legacy
8897 * reasons, while `osd new` requires the `uuid` to be provided. This
8898 * means that `osd create` will not be idempotent if an `uuid` is not
8899 * provided, but we will always guarantee the idempotency of `osd new`.
8900 */
8901
8902 ceph_assert(!uuid.is_zero());
8903 if (pending_inc.identify_osd(uuid) >= 0) {
8904 // osd is about to exist
8905 return -EAGAIN;
8906 }
8907
8908 int32_t i = osdmap.identify_osd(uuid);
8909 if (i >= 0) {
8910 // osd already exists
8911 if (id >= 0 && i != id) {
8912 ss << "uuid " << uuid << " already in use for different id " << i;
8913 return -EEXIST;
8914 }
8915 // return a positive errno to distinguish between a blocking error
8916 // and an error we consider to not be a problem (i.e., this would be
8917 // an idempotent operation).
8918 *existing_id = i;
8919 return EEXIST;
8920 }
8921 // i < 0
8922 if (id >= 0) {
8923 if (pending_inc.new_state.count(id)) {
8924 // osd is about to exist
8925 return -EAGAIN;
8926 }
8927 // we may not care if an osd exists if we are recreating a previously
8928 // destroyed osd.
8929 if (check_osd_exists && osdmap.exists(id)) {
8930 ss << "id " << id << " already in use and does not match uuid "
8931 << uuid;
8932 return -EINVAL;
8933 }
8934 }
8935 return 0;
8936 }
8937
8938 int OSDMonitor::prepare_command_osd_create(
8939 const int32_t id,
8940 const uuid_d& uuid,
8941 int32_t* existing_id,
8942 stringstream& ss)
8943 {
8944 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
8945 ceph_assert(existing_id);
8946 if (osdmap.is_destroyed(id)) {
8947 ss << "ceph osd create has been deprecated. Please use ceph osd new "
8948 "instead.";
8949 return -EINVAL;
8950 }
8951
8952 if (uuid.is_zero()) {
8953 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
8954 }
8955
8956 return validate_osd_create(id, uuid, true, existing_id, ss);
8957 }
8958
/**
 * Handle the `osd new` monitor command.
 *
 * Creates a brand new osd, or recreates an osd previously marked as
 * destroyed, optionally registering its secrets (cephx, cephx lockbox,
 * dm-crypt key) with the auth monitor and the config-key service.  The
 * caller must have plugged paxos so that the osdmap, auth, and
 * config-key updates are proposed together.
 *
 * @param op      originating monitor op request
 * @param cmdmap  parsed command arguments (`uuid` required, `id` optional)
 * @param params  options parsed from the command's input buffer:
 *                `cephx_secret`, `cephx_lockbox_secret`, `dmcrypt_key`,
 *                `crush_device_class`
 * @param ss      human-readable output/error stream
 * @param f       optional formatter for structured output (may be null)
 * @returns 0 on success with pending updates queued, positive EEXIST for
 *          an idempotent replay (nothing to do), negative errno on error.
 */
int OSDMonitor::prepare_command_osd_new(
  MonOpRequestRef op,
  const cmdmap_t& cmdmap,
  const map<string,string>& params,
  stringstream &ss,
  Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos->is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  // a positive EEXIST from validate_osd_create() means the uuid already
  // maps to an osd (matching `id` if one was given) -- likely a replay.
  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  // presence of any secret-related param means we must run the auth path.
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  ConfigKeyService *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the pair was supplied; they must come together.
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    // validate only -- the destructive do_osd_new() calls come later,
    // after every check that can fail has passed.
    err = mon->authmon()->validate_osd_new(id, uuid,
                                           cephx_secret,
                                           lockbox_secret,
                                           cephx_entity,
                                           lockbox_entity,
                                           ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = (ConfigKeyService*)mon->config_key_service;
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon->authmon()->do_osd_new(cephx_entity,
                                     lockbox_entity,
                                     has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_weight[id] = CEPH_OSD_OUT;
    // NOTE(review): new_state bits appear to toggle the osd's current
    // state flags when the incremental is applied (the comment below
    // relies on setting CEPH_OSD_UP to *clear* a stale UP flag) --
    // confirm against OSDMap::Incremental before changing these.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9223
9224 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9225 {
9226 op->mark_osdmon_event(__func__);
9227 auto m = op->get_req<MMonCommand>();
9228 stringstream ss;
9229 cmdmap_t cmdmap;
9230 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9231 string rs = ss.str();
9232 mon->reply_command(op, -EINVAL, rs, get_last_committed());
9233 return true;
9234 }
9235
9236 MonSession *session = op->get_session();
9237 if (!session) {
9238 derr << __func__ << " no session" << dendl;
9239 mon->reply_command(op, -EACCES, "access denied", get_last_committed());
9240 return true;
9241 }
9242
9243 return prepare_command_impl(op, cmdmap);
9244 }
9245
9246 static int parse_reweights(CephContext *cct,
9247 const cmdmap_t& cmdmap,
9248 const OSDMap& osdmap,
9249 map<int32_t, uint32_t>* weights)
9250 {
9251 string weights_str;
9252 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9253 return -EINVAL;
9254 }
9255 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9256 json_spirit::mValue json_value;
9257 if (!json_spirit::read(weights_str, json_value)) {
9258 return -EINVAL;
9259 }
9260 if (json_value.type() != json_spirit::obj_type) {
9261 return -EINVAL;
9262 }
9263 const auto obj = json_value.get_obj();
9264 try {
9265 for (auto& osd_weight : obj) {
9266 auto osd_id = std::stoi(osd_weight.first);
9267 if (!osdmap.exists(osd_id)) {
9268 return -ENOENT;
9269 }
9270 if (osd_weight.second.type() != json_spirit::str_type) {
9271 return -EINVAL;
9272 }
9273 auto weight = std::stoul(osd_weight.second.get_str());
9274 weights->insert({osd_id, weight});
9275 }
9276 } catch (const std::logic_error& e) {
9277 return -EINVAL;
9278 }
9279 return 0;
9280 }
9281
/**
 * Mark an existing osd as destroyed, removing its auth entities and its
 * config-key (lockbox) data while keeping the osd id in the map.
 *
 * The caller must have plugged paxos and is responsible for proposing
 * the pending change: `osd purge` reuses this helper, and a
 * PaxosService may only propose once per round (see the note at the
 * bottom).
 *
 * @param id  osd id to destroy
 * @param ss  human-readable output/error stream
 * @returns 0 on success (including the already-destroyed replay case),
 *          -ENOENT if the osd does not exist, or a negative errno from
 *          auth validation.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  // validate first; -ENOENT from either service means that leg is
  // already gone (an idempotent replay), not a failure.
  int err = mon->authmon()->validate_osd_destroy(id, uuid,
                                                 cephx_entity,
                                                 lockbox_entity,
                                                 ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  ConfigKeyService *svc = (ConfigKeyService*)mon->config_key_service;
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // all validation passed; the destructive updates below are asserted
  // to succeed, since partial application would leave the auth and
  // config-key services inconsistent with the osdmap.
  if (!idempotent_auth) {
    err = mon->authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // flag the osd destroyed and forget its uuid (reset to the zero uuid)
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9353
/**
 * Purge an osd: remove it from crush, destroy its auth/config-key
 * state, and remove the id from the osdmap entirely.
 *
 * The caller must have plugged paxos and is responsible for proposing
 * the accumulated pending changes.
 *
 * @param id  osd id to purge; must not be up (asserted below).
 * @param ss  human-readable output/error stream
 * @returns 0 on success, -ENOENT if the osd was already fully purged
 *          (idempotent replay), or a negative errno on failure.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos->is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush;
  _get_pending_crush(newcrush);

  bool may_be_idempotent = false;

  // step 1: validate (but do not yet apply) the crush removal.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush: possibly a replay of an earlier purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy the osd (auth entities, config keys, map flags).
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      // the destroy actually did work, so this run is not a replay.
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: drop the id from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush removal validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9422
9423 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9424 const cmdmap_t& cmdmap)
9425 {
9426 op->mark_osdmon_event(__func__);
9427 auto m = op->get_req<MMonCommand>();
9428 bool ret = false;
9429 stringstream ss;
9430 string rs;
9431 bufferlist rdata;
9432 int err = 0;
9433
9434 string format;
9435 cmd_getval(cmdmap, "format", format, string("plain"));
9436 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9437
9438 string prefix;
9439 cmd_getval(cmdmap, "prefix", prefix);
9440
9441 int64_t osdid;
9442 string osd_name;
9443 bool osdid_present = false;
9444 if (prefix != "osd pg-temp" &&
9445 prefix != "osd pg-upmap" &&
9446 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9447 osdid_present = cmd_getval(cmdmap, "id", osdid);
9448 }
9449 if (osdid_present) {
9450 ostringstream oss;
9451 oss << "osd." << osdid;
9452 osd_name = oss.str();
9453 }
9454
9455 // Even if there's a pending state with changes that could affect
9456 // a command, considering that said state isn't yet committed, we
9457 // just don't care about those changes if the command currently being
9458 // handled acts as a no-op against the current committed state.
9459 // In a nutshell, we assume this command happens *before*.
9460 //
9461 // Let me make this clearer:
9462 //
9463 // - If we have only one client, and that client issues some
9464 // operation that would conflict with this operation but is
9465 // still on the pending state, then we would be sure that said
9466 // operation wouldn't have returned yet, so the client wouldn't
9467 // issue this operation (unless the client didn't wait for the
9468 // operation to finish, and that would be the client's own fault).
9469 //
9470 // - If we have more than one client, each client will observe
9471 // whatever is the state at the moment of the commit. So, if we
9472 // have two clients, one issuing an unlink and another issuing a
9473 // link, and if the link happens while the unlink is still on the
9474 // pending state, from the link's point-of-view this is a no-op.
9475 // If different clients are issuing conflicting operations and
9476 // they care about that, then the clients should make sure they
9477 // enforce some kind of concurrency mechanism -- from our
9478 // perspective that's what Douglas Adams would call an SEP.
9479 //
9480 // This should be used as a general guideline for most commands handled
9481 // in this function. Adapt as you see fit, but please bear in mind that
9482 // this is the expected behavior.
9483
9484
9485 if (prefix == "osd setcrushmap" ||
9486 (prefix == "osd crush set" && !osdid_present)) {
9487 if (pending_inc.crush.length()) {
9488 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9489 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9490 return true;
9491 }
9492 dout(10) << "prepare_command setting new crush map" << dendl;
9493 bufferlist data(m->get_data());
9494 CrushWrapper crush;
9495 try {
9496 auto bl = data.cbegin();
9497 crush.decode(bl);
9498 }
9499 catch (const std::exception &e) {
9500 err = -EINVAL;
9501 ss << "Failed to parse crushmap: " << e.what();
9502 goto reply;
9503 }
9504
9505 int64_t prior_version = 0;
9506 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9507 if (prior_version == osdmap.get_crush_version() - 1) {
9508 // see if we are a resend of the last update. this is imperfect
9509 // (multiple racing updaters may not both get reliable success)
9510 // but we expect crush updaters (via this interface) to be rare-ish.
9511 bufferlist current, proposed;
9512 osdmap.crush->encode(current, mon->get_quorum_con_features());
9513 crush.encode(proposed, mon->get_quorum_con_features());
9514 if (current.contents_equal(proposed)) {
9515 dout(10) << __func__
9516 << " proposed matches current and version equals previous"
9517 << dendl;
9518 err = 0;
9519 ss << osdmap.get_crush_version();
9520 goto reply;
9521 }
9522 }
9523 if (prior_version != osdmap.get_crush_version()) {
9524 err = -EPERM;
9525 ss << "prior_version " << prior_version << " != crush version "
9526 << osdmap.get_crush_version();
9527 goto reply;
9528 }
9529 }
9530
9531 if (crush.has_legacy_rule_ids()) {
9532 err = -EINVAL;
9533 ss << "crush maps with ruleset != ruleid are no longer allowed";
9534 goto reply;
9535 }
9536 if (!validate_crush_against_features(&crush, ss)) {
9537 err = -EINVAL;
9538 goto reply;
9539 }
9540
9541 err = osdmap.validate_crush_rules(&crush, &ss);
9542 if (err < 0) {
9543 goto reply;
9544 }
9545
9546 if (g_conf()->mon_osd_crush_smoke_test) {
9547 // sanity check: test some inputs to make sure this map isn't
9548 // totally broken
9549 dout(10) << " testing map" << dendl;
9550 stringstream ess;
9551 CrushTester tester(crush, ess);
9552 tester.set_min_x(0);
9553 tester.set_max_x(50);
9554 auto start = ceph::coarse_mono_clock::now();
9555 int r = tester.test_with_fork(g_conf()->mon_lease);
9556 auto duration = ceph::coarse_mono_clock::now() - start;
9557 if (r < 0) {
9558 dout(10) << " tester.test_with_fork returns " << r
9559 << ": " << ess.str() << dendl;
9560 ss << "crush smoke test failed with " << r << ": " << ess.str();
9561 err = r;
9562 goto reply;
9563 }
9564 dout(10) << __func__ << " crush somke test duration: "
9565 << duration << ", result: " << ess.str() << dendl;
9566 }
9567
9568 pending_inc.crush = data;
9569 ss << osdmap.get_crush_version() + 1;
9570 goto update;
9571
9572 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
9573 CrushWrapper newcrush;
9574 _get_pending_crush(newcrush);
9575 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
9576 int bid = -1 - b;
9577 if (newcrush.bucket_exists(bid) &&
9578 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
9579 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
9580 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
9581 }
9582 }
9583 if (!validate_crush_against_features(&newcrush, ss)) {
9584 err = -EINVAL;
9585 goto reply;
9586 }
9587 pending_inc.crush.clear();
9588 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
9590 get_last_committed() + 1));
9591 return true;
9592 } else if (prefix == "osd crush set-device-class") {
9593 string device_class;
9594 if (!cmd_getval(cmdmap, "class", device_class)) {
9595 err = -EINVAL; // no value!
9596 goto reply;
9597 }
9598
9599 bool stop = false;
9600 vector<string> idvec;
9601 cmd_getval(cmdmap, "ids", idvec);
9602 CrushWrapper newcrush;
9603 _get_pending_crush(newcrush);
9604 set<int> updated;
9605 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9606 set<int> osds;
9607 // wildcard?
9608 if (j == 0 &&
9609 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9610 osdmap.get_all_osds(osds);
9611 stop = true;
9612 } else {
9613 // try traditional single osd way
9614 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9615 if (osd < 0) {
9616 // ss has reason for failure
9617 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9618 err = -EINVAL;
9619 continue;
9620 }
9621 osds.insert(osd);
9622 }
9623
9624 for (auto &osd : osds) {
9625 if (!osdmap.exists(osd)) {
9626 ss << "osd." << osd << " does not exist. ";
9627 continue;
9628 }
9629
9630 ostringstream oss;
9631 oss << "osd." << osd;
9632 string name = oss.str();
9633
9634 if (newcrush.get_max_devices() < osd + 1) {
9635 newcrush.set_max_devices(osd + 1);
9636 }
9637 string action;
9638 if (newcrush.item_exists(osd)) {
9639 action = "updating";
9640 } else {
9641 action = "creating";
9642 newcrush.set_item_name(osd, name);
9643 }
9644
9645 dout(5) << action << " crush item id " << osd << " name '" << name
9646 << "' device_class '" << device_class << "'"
9647 << dendl;
9648 err = newcrush.update_device_class(osd, device_class, name, &ss);
9649 if (err < 0) {
9650 goto reply;
9651 }
9652 if (err == 0 && !_have_pending_crush()) {
9653 if (!stop) {
9654 // for single osd only, wildcard makes too much noise
9655 ss << "set-device-class item id " << osd << " name '" << name
9656 << "' device_class '" << device_class << "': no change. ";
9657 }
9658 } else {
9659 updated.insert(osd);
9660 }
9661 }
9662 }
9663
9664 if (!updated.empty()) {
9665 pending_inc.crush.clear();
9666 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9667 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
9668 getline(ss, rs);
9669 wait_for_finished_proposal(op,
9670 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9671 return true;
9672 }
9673
9674 } else if (prefix == "osd crush rm-device-class") {
9675 bool stop = false;
9676 vector<string> idvec;
9677 cmd_getval(cmdmap, "ids", idvec);
9678 CrushWrapper newcrush;
9679 _get_pending_crush(newcrush);
9680 set<int> updated;
9681
9682 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
9683 set<int> osds;
9684
9685 // wildcard?
9686 if (j == 0 &&
9687 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
9688 osdmap.get_all_osds(osds);
9689 stop = true;
9690 } else {
9691 // try traditional single osd way
9692 long osd = parse_osd_id(idvec[j].c_str(), &ss);
9693 if (osd < 0) {
9694 // ss has reason for failure
9695 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
9696 err = -EINVAL;
9697 goto reply;
9698 }
9699 osds.insert(osd);
9700 }
9701
9702 for (auto &osd : osds) {
9703 if (!osdmap.exists(osd)) {
9704 ss << "osd." << osd << " does not exist. ";
9705 continue;
9706 }
9707
9708 auto class_name = newcrush.get_item_class(osd);
9709 if (!class_name) {
9710 ss << "osd." << osd << " belongs to no class, ";
9711 continue;
9712 }
9713 // note that we do not verify if class_is_in_use here
9714 // in case the device is misclassified and user wants
9715 // to overridely reset...
9716
9717 err = newcrush.remove_device_class(cct, osd, &ss);
9718 if (err < 0) {
9719 // ss has reason for failure
9720 goto reply;
9721 }
9722 updated.insert(osd);
9723 }
9724 }
9725
9726 if (!updated.empty()) {
9727 pending_inc.crush.clear();
9728 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9729 ss << "done removing class of osd(s): " << updated;
9730 getline(ss, rs);
9731 wait_for_finished_proposal(op,
9732 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
9733 return true;
9734 }
9735 } else if (prefix == "osd crush class create") {
9736 string device_class;
9737 if (!cmd_getval(cmdmap, "class", device_class)) {
9738 err = -EINVAL; // no value!
9739 goto reply;
9740 }
9741 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9742 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9743 << "luminous' before using crush device classes";
9744 err = -EPERM;
9745 goto reply;
9746 }
9747 if (!_have_pending_crush() &&
9748 _get_stable_crush().class_exists(device_class)) {
9749 ss << "class '" << device_class << "' already exists";
9750 goto reply;
9751 }
9752 CrushWrapper newcrush;
9753 _get_pending_crush(newcrush);
9754 if (newcrush.class_exists(device_class)) {
9755 ss << "class '" << device_class << "' already exists";
9756 goto update;
9757 }
9758 int class_id = newcrush.get_or_create_class_id(device_class);
9759 pending_inc.crush.clear();
9760 newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
9761 ss << "created class " << device_class << " with id " << class_id
9762 << " to crush map";
9763 goto update;
9764 } else if (prefix == "osd crush class rm") {
9765 string device_class;
9766 if (!cmd_getval(cmdmap, "class", device_class)) {
9767 err = -EINVAL; // no value!
9768 goto reply;
9769 }
9770 if (osdmap.require_osd_release < ceph_release_t::luminous) {
9771 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
9772 << "luminous' before using crush device classes";
      err = -EPERM;
      goto reply;
    }

    // Already absent from the committed map: nothing to do, report success.
    if (!osdmap.crush->class_exists(device_class)) {
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(device_class)) {
      err = 0; // make command idempotent
      // class is already gone in the pending map; wait for that
      // proposal to commit before replying
      goto wait;
    }
    int class_id = newcrush.get_class_id(device_class);
    stringstream ts;
    // refuse to remove a class that crush rules/buckets still reference;
    // ts receives the human-readable reason
    if (newcrush.class_is_in_use(class_id, &ts)) {
      err = -EBUSY;
      ss << "class '" << device_class << "' " << ts.str();
      goto reply;
    }

    // check if class is used by any erasure-code-profiles
    // (merge committed profiles with any pending ones so both are checked)
    mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
      osdmap.get_erasure_code_profiles();
    auto ec_profiles = pending_inc.get_erasure_code_profiles();
#ifdef HAVE_STDLIB_MAP_SPLICING
    ec_profiles.merge(old_ec_profiles);
#else
    ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
                       make_move_iterator(end(old_ec_profiles)));
#endif
    list<string> referenced_by;
    for (auto &i: ec_profiles) {
      for (auto &j: i.second) {
        if ("crush-device-class" == j.first && device_class == j.second) {
          referenced_by.push_back(i.first);
        }
      }
    }
    if (!referenced_by.empty()) {
      err = -EBUSY;
      ss << "class '" << device_class
         << "' is still referenced by erasure-code-profile(s): " << referenced_by;
      goto reply;
    }

    // detach the class from every OSD that carries it
    set<int> osds;
    newcrush.get_devices_by_class(device_class, &osds);
    for (auto& p: osds) {
      err = newcrush.remove_device_class(g_ceph_context, p, &ss);
      if (err < 0) {
        // ss has reason for failure
        goto reply;
      }
    }

    if (osds.empty()) {
      // empty class, remove directly
      err = newcrush.remove_class_name(device_class);
      if (err < 0) {
        ss << "class '" << device_class << "' cannot be removed '"
           << cpp_strerror(err) << "'";
        goto reply;
      }
    }

    // stage the modified crush map in the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "removed class " << device_class << " with id " << class_id
       << " from crush map";
    goto update;
  } else if (prefix == "osd crush class rename") {
    // osd crush class rename <srcname> <dstname>
    string srcname, dstname;
    if (!cmd_getval(cmdmap, "srcname", srcname)) {
      err = -EINVAL;
      goto reply;
    }
    if (!cmd_getval(cmdmap, "dstname", dstname)) {
      err = -EINVAL;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
      // suppose this is a replay and return success
      // so command is idempotent
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_class(srcname, dstname);
    if (err < 0) {
      ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
         << cpp_strerror(err);
      goto reply;
    }

    // stage the modified crush map for the next proposal
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "rename class '" << srcname << "' to '" << dstname << "'";
    goto update;
  } else if (prefix == "osd crush add-bucket") {
    // osd crush add-bucket <name> <type> [<loc1> ...]
    string name, typestr;
    vector<string> argvec;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "type", typestr);
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    if (!argvec.empty()) {
      CrushWrapper::parse_loc_map(argvec, &loc);
      dout(0) << "will create and move bucket '" << name
              << "' to location " << loc << dendl;
    }

    // fast path: nothing pending and the bucket already exists in the
    // committed map -> idempotent success
    if (!_have_pending_crush() &&
        _get_stable_crush().name_exists(name)) {
      ss << "bucket '" << name << "' already exists";
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.name_exists(name)) {
      // exists only in the pending map; wait for it to commit
      ss << "bucket '" << name << "' already exists";
      goto update;
    }
    int type = newcrush.get_type_id(typestr);
    if (type < 0) {
      ss << "type '" << typestr << "' does not exist";
      err = -EINVAL;
      goto reply;
    }
    if (type == 0) {
      // type id 0 is reserved for devices (OSDs)
      ss << "type '" << typestr << "' is for devices, not buckets";
      err = -EINVAL;
      goto reply;
    }
    int bucketno;
    err = newcrush.add_bucket(0, 0,
                              CRUSH_HASH_DEFAULT, type, 0, NULL,
                              NULL, &bucketno);
    if (err < 0) {
      ss << "add_bucket error: '" << cpp_strerror(err) << "'";
      goto reply;
    }
    err = newcrush.set_item_name(bucketno, name);
    if (err < 0) {
      ss << "error setting bucket name to '" << name << "'";
      goto reply;
    }

    // optionally place the new bucket at the requested crush location
    if (!loc.empty()) {
      if (!newcrush.check_item_loc(cct, bucketno, loc,
          (int *)NULL)) {
        err = newcrush.move_bucket(cct, bucketno, loc);
        if (err < 0) {
          ss << "error moving bucket '" << name << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << bucketno << " name '" << name
           << "' to location " << loc << " in crush map";
      }
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    if (loc.empty()) {
      ss << "added bucket " << name << " type " << typestr
         << " to crush map";
    } else {
      ss << "added bucket " << name << " type " << typestr
         << " to location " << loc;
    }
    goto update;
  } else if (prefix == "osd crush rename-bucket") {
    // osd crush rename-bucket <srcname> <dstname>
    string srcname, dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);

    err = crush_rename_bucket(srcname, dstname, &ss);
    if (err == -EALREADY) // equivalent to success for idempotency
      err = 0;
    if (err)
      goto reply;
    else
      goto update;
  } else if (prefix == "osd crush weight-set create" ||
             prefix == "osd crush weight-set create-compat") {
    // create a per-pool weight-set, or the single backward-compatible one
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    int positions;
    // choose_args (weight-sets) only work with straw2 buckets
    if (newcrush.has_non_straw2_buckets()) {
      ss << "crush map contains one or more bucket(s) that are not straw2";
      err = -EPERM;
      goto reply;
    }
    if (prefix == "osd crush weight-set create") {
      // per-pool weight-sets require all clients to be luminous+
      if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
          osdmap.require_min_compat_client < ceph_release_t::luminous) {
        ss << "require_min_compat_client "
           << osdmap.require_min_compat_client
           << " < luminous, which is required for per-pool weight-sets. "
           << "Try 'ceph osd set-require-min-compat-client luminous' "
           << "before using the new interface";
        err = -EPERM;
        goto reply;
      }
      string poolname, mode;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      cmd_getval(cmdmap, "mode", mode);
      if (mode != "flat" && mode != "positional") {
        ss << "unrecognized weight-set mode '" << mode << "'";
        err = -EINVAL;
        goto reply;
      }
      // positional mode keeps one weight per replica position (pool size)
      positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
    } else {
      // compat weight-set: one global set with a single position
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      positions = 1;
    }
    if (!newcrush.create_choose_args(pool, positions)) {
      // already present -> idempotent no-op
      if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
        ss << "compat weight-set already created";
      } else {
        ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
           << "' already created";
      }
      goto reply;
    }
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set rm" ||
             prefix == "osd crush weight-set rm-compat") {
    // remove a per-pool weight-set or the backward-compatible one
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set rm") {
      string poolname;
      cmd_getval(cmdmap, "pool", poolname);
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
    }
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;

  } else if (prefix == "osd crush weight-set reweight" ||
             prefix == "osd crush weight-set reweight-compat") {
    // set the weight-set weight(s) of one crush item
    string poolname, item;
    vector<double> weight;
    cmd_getval(cmdmap, "pool", poolname);
    cmd_getval(cmdmap, "item", item);
    cmd_getval(cmdmap, "weight", weight);
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    int64_t pool;
    if (prefix == "osd crush weight-set reweight") {
      pool = osdmap.lookup_pg_pool_name(poolname.c_str());
      if (pool < 0) {
        ss << "pool '" << poolname << "' not found";
        err = -ENOENT;
        goto reply;
      }
      if (!newcrush.have_choose_args(pool)) {
        ss << "no weight-set for pool '" << poolname << "'";
        err = -ENOENT;
        goto reply;
      }
      // must supply exactly one weight per position of the weight-set
      auto arg_map = newcrush.choose_args_get(pool);
      int positions = newcrush.get_choose_args_positions(arg_map);
      if (weight.size() != (size_t)positions) {
         ss << "must specify exact " << positions << " weight values";
         err = -EINVAL;
         goto reply;
      }
    } else {
      pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
      if (!newcrush.have_choose_args(pool)) {
        ss << "no backward-compatible weight-set";
        err = -ENOENT;
        goto reply;
      }
    }
    if (!newcrush.name_exists(item)) {
      ss << "item '" << item << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    err = newcrush.choose_args_adjust_item_weightf(
      cct,
      newcrush.choose_args_get(pool),
      newcrush.get_item_id(item),
      weight,
      &ss);
    if (err < 0) {
      goto reply;
    }
    err = 0;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    goto update;
  } else if (osdid_present &&
             (prefix == "osd crush set" || prefix == "osd crush add")) {
    // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
    // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
    // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]

    if (!osdmap.exists(osdid)) {
      err = -ENOENT;
      ss << osd_name
         << " does not exist. Create it before updating the crush map";
      goto reply;
    }

    double weight;
    if (!cmd_getval(cmdmap, "weight", weight)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    string args;
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // 'set' only updates an existing item; it must already be in the
    // committed crush map
    if (prefix == "osd crush set"
        && !_get_stable_crush().item_exists(osdid)) {
      err = -ENOENT;
      ss << "unable to set item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc
         << ": does not exist";
      goto reply;
    }

    dout(5) << "adding/updating crush item id " << osdid << " name '"
      << osd_name << "' weight " << weight << " at location "
      << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string action;
    // 'add' on an item already at this location degrades to 'set'
    if (prefix == "osd crush set" ||
        newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
      action = "set";
      err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
    } else {
      action = "add";
      err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
      if (err == 0)
        err = 1;  // force "changed" so the no-change shortcut below is skipped
    }

    if (err < 0)
      goto reply;

    // no change and nothing already pending: reply without proposing
    if (err == 0 && !_have_pending_crush()) {
      ss << action << " item id " << osdid << " name '" << osd_name
         << "' weight " << weight << " at location " << loc << ": no change";
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
       << weight << " at location " << loc << " to crush map";
    getline(ss, rs);
    // reply once the proposal commits
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush create-or-move") {
    do {
      // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
      if (!osdmap.exists(osdid)) {
        err = -ENOENT;
        ss << osd_name
           << " does not exist. create it before updating the crush map";
        goto reply;
      }

      double weight;
      if (!cmd_getval(cmdmap, "weight", weight)) {
        ss << "unable to parse weight value '"
           << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
        err = -EINVAL;
        goto reply;
      }

      string args;
      vector<string> argvec;
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "create-or-move crush item name '" << osd_name
              << "' initial_weight " << weight << " at location " << loc
              << dendl;

      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
                                         g_conf()->osd_crush_update_weight_set);
      if (err == 0) {
        // no change needed; fall out of the do/while and reply
        ss << "create-or-move updated item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        break;
      }
      if (err > 0) {
        // map changed: stage it and reply after the proposal commits
        pending_inc.crush.clear();
        newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        ss << "create-or-move updating item name '" << osd_name
           << "' weight " << weight
           << " at location " << loc << " to crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
    } while (false);
  } else if (prefix == "osd crush move") {
    do {
      // osd crush move <name> <loc1> [<loc2> ...]
      string name;
      vector<string> argvec;
      cmd_getval(cmdmap, "name", name);
      cmd_getval(cmdmap, "args", argvec);
      map<string,string> loc;
      CrushWrapper::parse_loc_map(argvec, &loc);

      dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      if (!newcrush.name_exists(name)) {
        err = -ENOENT;
        ss << "item " << name << " does not exist";
        break;
      }
      int id = newcrush.get_item_id(name);

      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        // id >= 0 is a device (OSD), negative ids are buckets
        if (id >= 0) {
          err = newcrush.create_or_move_item(
            cct, id, 0, name, loc,
            g_conf()->osd_crush_update_weight_set);
        } else {
          err = newcrush.move_bucket(cct, id, loc);
        }
        if (err >= 0) {
          ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
          getline(ss, rs);
          wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                   get_last_committed() + 1));
          return true;
        }
      } else {
        // already at the requested location -> idempotent success
        ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
        err = 0;
      }
    } while (false);
  } else if (prefix == "osd crush swap-bucket") {
    // osd crush swap-bucket <source> <dest> [--yes-i-really-mean-it]
    string source, dest;
    cmd_getval(cmdmap, "source", source);
    cmd_getval(cmdmap, "dest", dest);

    bool force = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", force);

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.name_exists(source)) {
      ss << "source item " << source << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (!newcrush.name_exists(dest)) {
      ss << "dest item " << dest << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    int sid = newcrush.get_item_id(source);
    int did = newcrush.get_item_id(dest);
    int sparent;
    // by default only orphan (unparented) sources may be swapped
    if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
      ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    // mismatched bucket algorithms also require the force flag
    if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
        !force) {
      ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
         << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
         << "; pass --yes-i-really-mean-it to proceed anyway";
      err = -EPERM;
      goto reply;
    }
    int r = newcrush.swap_bucket(cct, sid, did);
    if (r < 0) {
      ss << "failed to swap bucket contents: " << cpp_strerror(r);
      err = r;
      goto reply;
    }
    ss << "swapped bucket of " << source << " to " << dest;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    wait_for_finished_proposal(op,
                               new Monitor::C_Command(mon, op, err, ss.str(),
                                                      get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush link") {
    // osd crush link <name> <loc1> [<loc2> ...]
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> argvec;
    cmd_getval(cmdmap, "args", argvec);
    map<string,string> loc;
    CrushWrapper::parse_loc_map(argvec, &loc);

    // Need an explicit check for name_exists because get_item_id returns
    // 0 on unfound.
    int id = osdmap.crush->get_item_id(name);
    if (!osdmap.crush->name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
    }
    // already linked at this location in the committed map -> no-op
    if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
      ss << "no need to move item id " << id << " name '" << name
         << "' to location " << loc << " in crush map";
      err = 0;
      goto reply;
    }

    dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    // re-check against the pending map, which may differ from the committed one
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "item " << name << " does not exist";
      goto reply;
    } else {
      int id = newcrush.get_item_id(name);
      if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
        err = newcrush.link_bucket(cct, id, loc);
        if (err >= 0) {
          ss << "linked item id " << id << " name '" << name
             << "' to location " << loc << " in crush map";
          pending_inc.crush.clear();
          newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
        } else {
          ss << "cannot link item id " << id << " name '" << name
             << "' to location " << loc;
          goto reply;
        }
      } else {
        ss << "no need to move item id " << id << " name '" << name
           << "' to location " << loc << " in crush map";
        err = 0;
      }
    }
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush rm" ||
             prefix == "osd crush remove" ||
             prefix == "osd crush unlink") {
    do {
      // osd crush rm <id> [ancestor]
      // 'unlink' detaches the item from one location without deleting it
      CrushWrapper newcrush;
      _get_pending_crush(newcrush);

      string name;
      cmd_getval(cmdmap, "name", name);

      // gone from the committed map -> idempotent success, plain reply
      if (!osdmap.crush->name_exists(name)) {
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        break;
      }
      // gone only from the pending map -> reply after that commit
      if (!newcrush.name_exists(name)) {
        err = 0;
        ss << "device '" << name << "' does not appear in the crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
      int id = newcrush.get_item_id(name);
      int ancestor = 0;

      bool unlink_only = prefix == "osd crush unlink";
      string ancestor_str;
      if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
        if (!newcrush.name_exists(ancestor_str)) {
          err = -ENOENT;
          ss << "ancestor item '" << ancestor_str
             << "' does not appear in the crush map";
          break;
        }
        ancestor = newcrush.get_item_id(ancestor_str);
      }

      err = prepare_command_osd_crush_remove(
          newcrush,
          id, ancestor,
          (ancestor < 0), unlink_only);

      if (err == -ENOENT) {
        ss << "item " << id << " does not appear in that position";
        err = 0;
        break;
      }
      if (err == 0) {
        if (!unlink_only)
          pending_inc.new_crush_node_flags[id] = 0;  // clear any per-node flags on full removal
        ss << "removed item id " << id << " name '" << name << "' from crush map";
        getline(ss, rs);
        wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                  get_last_committed() + 1));
        return true;
      }
    } while (false);
  } else if (prefix == "osd crush reweight-all") {
    // recompute all bucket weights from their children
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    newcrush.reweight(cct);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted crush hierarchy";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight") {
    // osd crush reweight <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    // only leaves (devices, id >= 0) may be reweighted here
    if (id < 0) {
      ss << "device '" << name << "' is not a leaf in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_item_weightf(cct, id, w,
                                       g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted item id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush reweight-subtree") {
    // osd crush reweight-subtree <name> <weight>
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    string name;
    cmd_getval(cmdmap, "name", name);
    if (!newcrush.name_exists(name)) {
      err = -ENOENT;
      ss << "device '" << name << "' does not appear in the crush map";
      goto reply;
    }

    int id = newcrush.get_item_id(name);
    // only buckets (negative ids) are subtrees; inverse of the check above
    if (id >= 0) {
      ss << "device '" << name << "' is not a subtree in the crush map";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }

    err = newcrush.adjust_subtree_weightf(cct, id, w,
                                          g_conf()->osd_crush_update_weight_set);
    if (err < 0)
      goto reply;
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
       << " in crush map";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush tunables") {
    // osd crush tunables <profile> -- apply a named tunables profile
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "legacy" || profile == "argonaut") {
      newcrush.set_tunables_legacy();
    } else if (profile == "bobtail") {
      newcrush.set_tunables_bobtail();
    } else if (profile == "firefly") {
      newcrush.set_tunables_firefly();
    } else if (profile == "hammer") {
      newcrush.set_tunables_hammer();
    } else if (profile == "jewel") {
      newcrush.set_tunables_jewel();
    } else if (profile == "optimal") {
      newcrush.set_tunables_optimal();
    } else if (profile == "default") {
      newcrush.set_tunables_default();
    } else {
      ss << "unrecognized profile '" << profile << "'";
      err = -EINVAL;
      goto reply;
    }

    // refuse tunables that connected clients/daemons could not decode
    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunables profile to " << profile;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush set-tunable") {
    // osd crush set-tunable <tunable> <value> -- set one tunable directly
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    err = 0;
    string tunable;
    cmd_getval(cmdmap, "tunable", tunable);

    int64_t value = -1;
    if (!cmd_getval(cmdmap, "value", value)) {
      err = -EINVAL;
      ss << "failed to parse integer value "
         << cmd_vartype_stringify(cmdmap.at("value"));
      goto reply;
    }

    // straw_calc_version is currently the only tunable settable this way
    if (tunable == "straw_calc_version") {
      if (value != 0 && value != 1) {
        ss << "value must be 0 or 1; got " << value;
        err = -EINVAL;
        goto reply;
      }
      newcrush.set_straw_calc_version(value);
    } else {
      ss << "unrecognized tunable '" << tunable << "'";
      err = -EINVAL;
      goto reply;
    }

    if (!validate_crush_against_features(&newcrush, ss)) {
      err = -EINVAL;
      goto reply;
    }

    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    ss << "adjusted tunable " << tunable << " to " << value;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-simple") {
    // osd crush rule create-simple <name> <root> <type> [<mode>]
    string name, root, type, mode;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "mode", mode);
    if (mode == "")
      mode = "firstn";

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
                                            pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-replicated") {
    // osd crush rule create-replicated <name> <root> <type> [<class>]
    // like create-simple, but restricted to a device class and firstn mode
    string name, root, type, device_class;
    cmd_getval(cmdmap, "name", name);
    cmd_getval(cmdmap, "root", root);
    cmd_getval(cmdmap, "type", type);
    cmd_getval(cmdmap, "class", device_class);

    if (osdmap.crush->rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (newcrush.rule_exists(name)) {
      // The name is uniquely associated to a ruleid and the rule it contains
      // From the user point of view, the rule is more meaningful.
      ss << "rule " << name << " already exists";
      err = 0;
    } else {
      int ruleno = newcrush.add_simple_rule(
        name, root, type, device_class,
        "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
      if (ruleno < 0) {
        err = ruleno;
        goto reply;
      }

      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd erasure-code-profile rm") {
    // osd erasure-code-profile rm <name>
    string name;
    cmd_getval(cmdmap, "name", name);

    // a pending pool still references it; retry after that commits
    if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
      goto wait;

    // a committed pool references it; removal is refused
    if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
      err = -EBUSY;
      goto reply;
    }

    if (osdmap.has_erasure_code_profile(name) ||
        pending_inc.new_erasure_code_profiles.count(name)) {
      if (osdmap.has_erasure_code_profile(name)) {
        pending_inc.old_erasure_code_profiles.push_back(name);
      } else {
        // profile only exists in the pending incremental: cancel its creation
        dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
        pending_inc.new_erasure_code_profiles.erase(name);
      }

      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      // not present anywhere -> idempotent success
      ss << "erasure-code-profile " << name << " does not exist";
      err = 0;
      goto reply;
    }

  } else if (prefix == "osd erasure-code-profile set") {
    // osd erasure-code-profile set <name> [<key=value> ...] [--force]
    string name;
    cmd_getval(cmdmap, "name", name);
    vector<string> profile;
    cmd_getval(cmdmap, "profile", profile);

    bool force = false;
    cmd_getval(cmdmap, "force", force);

    map<string,string> profile_map;
    err = parse_erasure_code_profile(profile, &profile_map, &ss);
    if (err)
      goto reply;
    if (profile_map.find("plugin") == profile_map.end()) {
      ss << "erasure-code-profile " << profile_map
         << " must contain a plugin entry" << std::endl;
      err = -EINVAL;
      goto reply;
    }
    string plugin = profile_map["plugin"];

    if (pending_inc.has_erasure_code_profile(name)) {
      // an update for this profile is already in flight; retry after commit
      dout(20) << "erasure code profile " << name << " try again" << dendl;
      goto wait;
    } else {
      err = normalize_profile(name, profile_map, force, &ss);
      if (err)
        goto reply;

      if (osdmap.has_erasure_code_profile(name)) {
        // normalize the existing profile too so the comparison is fair
        ErasureCodeProfile existing_profile_map =
          osdmap.get_erasure_code_profile(name);
        err = normalize_profile(name, existing_profile_map, force, &ss);
        if (err)
          goto reply;

        if (existing_profile_map == profile_map) {
          // identical -> idempotent success
          err = 0;
          goto reply;
        }
        // overwriting a different existing profile requires --force
        if (!force) {
          err = -EPERM;
          ss << "will not override erasure code profile " << name
             << " because the existing profile "
             << existing_profile_map
             << " is different from the proposed profile "
             << profile_map;
          goto reply;
        }
      }

      dout(20) << "erasure code profile set " << name << "="
               << profile_map << dendl;
      pending_inc.set_erasure_code_profile(name, profile_map);
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule create-erasure") {
    // osd crush rule create-erasure <name> [<profile>]
    err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err)
      goto reply;
    string name, poolstr;
    cmd_getval(cmdmap, "name", name);
    string profile;
    cmd_getval(cmdmap, "profile", profile);
    if (profile == "")
      profile = "default";
    if (profile == "default") {
      // lazily create the default erasure-code profile if missing
      if (!osdmap.has_erasure_code_profile(profile)) {
        if (pending_inc.has_erasure_code_profile(profile)) {
          dout(20) << "erasure code profile " << profile << " already pending" << dendl;
          goto wait;
        }

        map<string,string> profile_map;
        err = osdmap.get_erasure_code_profile_default(cct,
                                                      profile_map,
                                                      &ss);
        if (err)
          goto reply;
        err = normalize_profile(name, profile_map, true, &ss);
        if (err)
          goto reply;
        dout(20) << "erasure code profile set " << profile << "="
                 << profile_map << dendl;
        pending_inc.set_erasure_code_profile(profile, profile_map);
        // wait for the profile to commit, then the command is retried
        goto wait;
      }
    }

    int rule;
    err = crush_rule_create_erasure(name, profile, &rule, &ss);
    if (err < 0) {
      switch(err) {
      case -EEXIST: // return immediately
        ss << "rule " << name << " already exists";
        err = 0;
        goto reply;
        break;
      case -EALREADY: // wait for pending to be proposed
        ss << "rule " << name << " already exists";
        err = 0;
        break;
      default: // non recoverable error
        goto reply;
        break;
      }
    } else {
      ss << "created rule " << name << " at " << rule;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;

  } else if (prefix == "osd crush rule rm") {
    // Remove a CRUSH rule by name. Idempotent: a missing rule (in either the
    // committed or the pending crush map) replies success with err = 0.
    string name;
    cmd_getval(cmdmap, "name", name);

    if (!osdmap.crush->rule_exists(name)) {
      ss << "rule " << name << " does not exist";
      err = 0;
      goto reply;
    }

    // Work on a copy of the pending crush map so the edit is staged, not
    // applied directly.
    CrushWrapper newcrush;
    _get_pending_crush(newcrush);

    if (!newcrush.rule_exists(name)) {
      // Already removed in pending; still ack success after the proposal.
      ss << "rule " << name << " does not exist";
      err = 0;
    } else {
      int ruleno = newcrush.get_rule_id(name);
      ceph_assert(ruleno >= 0);

      // make sure it is not in use.
      // FIXME: this is ok in some situations, but let's not bother with that
      // complexity now.
      int ruleset = newcrush.get_rule_mask_ruleset(ruleno);
      if (osdmap.crush_rule_in_use(ruleset)) {
        // Refuse to delete a rule any pool still references.
        ss << "crush ruleset " << name << " " << ruleset << " is in use";
        err = -EBUSY;
        goto reply;
      }

      err = newcrush.remove_rule(ruleno);
      if (err < 0) {
        goto reply;
      }

      // Stage the full re-encoded crush map in the pending incremental.
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    }
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd crush rule rename") {
    // Rename a CRUSH rule. Designed to be idempotent so a replayed command
    // (src already gone, dst already present) succeeds.
    string srcname;
    string dstname;
    cmd_getval(cmdmap, "srcname", srcname);
    cmd_getval(cmdmap, "dstname", dstname);
    if (srcname.empty() || dstname.empty()) {
      ss << "must specify both source rule name and destination rule name";
      err = -EINVAL;
      goto reply;
    }
    if (srcname == dstname) {
      // Renaming to the same name is a no-op success.
      ss << "destination rule name is equal to source rule name";
      err = 0;
      goto reply;
    }

    CrushWrapper newcrush;
    _get_pending_crush(newcrush);
    if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
      // srcname does not exist and dstname already exists
      // suppose this is a replay and return success
      // (so this command is idempotent)
      ss << "already renamed to '" << dstname << "'";
      err = 0;
      goto reply;
    }

    err = newcrush.rename_rule(srcname, dstname, &ss);
    if (err < 0) {
      // ss has reason for failure
      goto reply;
    }
    // Stage the re-encoded crush map and ack once committed.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd setmaxosd") {
    // Set the osdmap's max_osd (the size of the OSD id space, not the OSD
    // count). Growing is always allowed up to mon_max_osd; shrinking is only
    // allowed when no OSD exists in the range being cut off.
    int64_t newmax;
    if (!cmd_getval(cmdmap, "newmax", newmax)) {
      ss << "unable to parse 'newmax' value '"
         << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
      err = -EINVAL;
      goto reply;
    }

    if (newmax > g_conf()->mon_max_osd) {
      err = -ERANGE;
      ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
         << g_conf()->mon_max_osd << ")";
      goto reply;
    }

    // Don't allow shrinking OSD number as this will cause data loss
    // and may cause kernel crashes.
    // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
    if (newmax < osdmap.get_max_osd()) {
      // Check if the OSDs exist between current max and new value.
      // If there are any OSDs exist, then don't allow shrinking number
      // of OSDs.
      for (int i = newmax; i < osdmap.get_max_osd(); i++) {
        if (osdmap.exists(i)) {
          err = -EBUSY;
          ss << "cannot shrink max_osd to " << newmax
             << " because osd." << i << " (and possibly others) still in use";
          goto reply;
        }
      }
    }

    pending_inc.new_max_osd = newmax;
    ss << "set new max_osd = " << pending_inc.new_max_osd;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-full-ratio" ||
             prefix == "osd set-backfillfull-ratio" ||
             prefix == "osd set-nearfull-ratio") {
    // Set one of the cluster fullness thresholds (full / backfillfull /
    // nearfull) in the pending incremental.
    double n;
    if (!cmd_getval(cmdmap, "ratio", n)) {
      ss << "unable to parse 'ratio' value '"
         << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // NOTE(review): no range validation here — values outside [0.0, 1.0] are
    // accepted as-is; confirm whether that is intended.
    if (prefix == "osd set-full-ratio")
      pending_inc.new_full_ratio = n;
    else if (prefix == "osd set-backfillfull-ratio")
      pending_inc.new_backfillfull_ratio = n;
    else if (prefix == "osd set-nearfull-ratio")
      pending_inc.new_nearfull_ratio = n;
    ss << prefix << " " << n;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd set-require-min-compat-client") {
    // Raise (or, with --yes-i-really-mean-it, force) the minimum client
    // release the cluster will accept. Refuses to go below what the current
    // osdmap features already require, and warns about connected daemons or
    // clients that would be locked out.
    string v;
    cmd_getval(cmdmap, "version", v);
    ceph_release_t vno = ceph_release_from_name(v);
    if (!vno) {
      ss << "version " << v << " is not recognized";
      err = -EINVAL;
      goto reply;
    }
    // Evaluate against the osdmap with pending changes applied, so the check
    // reflects what will actually commit.
    OSDMap newmap;
    newmap.deepish_copy_from(osdmap);
    newmap.apply_incremental(pending_inc);
    newmap.require_min_compat_client = vno;
    auto mvno = newmap.get_min_compat_client();
    if (vno < mvno) {
      ss << "osdmap current utilizes features that require " << mvno
         << "; cannot set require_min_compat_client below that to " << vno;
      err = -EPERM;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      // Scan the feature map of currently connected clients/MDS/MGR and list
      // every one that lacks the features implied by the requested release.
      FeatureMap m;
      mon->get_combined_feature_map(&m);
      uint64_t features = ceph_release_features(ceph::to_integer<int>(vno));
      bool first = true;
      bool ok = true;
      for (int type : {
            CEPH_ENTITY_TYPE_CLIENT,
            CEPH_ENTITY_TYPE_MDS,
            CEPH_ENTITY_TYPE_MGR }) {
        auto p = m.m.find(type);
        if (p == m.m.end()) {
          continue;
        }
        for (auto& q : p->second) {
          uint64_t missing = ~q.first & features;
          if (missing) {
            if (first) {
              ss << "cannot set require_min_compat_client to " << v << ": ";
            } else {
              ss << "; ";
            }
            first = false;
            ss << q.second << " connected " << ceph_entity_type_name(type)
               << "(s) look like " << ceph_release_name(
                 ceph_release_from_features(q.first))
               << " (missing 0x" << std::hex << missing << std::dec << ")";
            ok = false;
          }
        }
      }
      if (!ok) {
        ss << "; add --yes-i-really-mean-it to do it anyway";
        err = -EPERM;
        goto reply;
      }
    }
    ss << "set require_min_compat_client to " << vno;
    pending_inc.new_require_min_compat_client = vno;
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                          get_last_committed() + 1));
    return true;
  } else if (prefix == "osd pause") {
    // "osd pause" / "osd unpause" toggle both read and write pause bits.
    return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd unpause") {
    return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);

  } else if (prefix == "osd set") {
    // Set a named cluster-wide osdmap flag; prepare_set_flag() stages the
    // flag and handles the proposal/reply itself.
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);

    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else if (key == "pglog_hardlimit") {
      // This flag is gated: require OSDs to be up (or --yes-i-really-mean-it)
      // so the feature check below is meaningful.
      if (!osdmap.get_num_up_osds() && !sure) {
        ss << "Not advisable to continue since no OSDs are up. Pass "
           << "--yes-i-really-mean-it if you really wish to continue.";
        err = -EPERM;
        goto reply;
      }
      // The release check here is required because for OSD_PGLOG_HARDLIMIT,
      // we are reusing a jewel feature bit that was retired in luminous.
      if (osdmap.require_osd_release >= ceph_release_t::luminous &&
          (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
           || sure)) {
        return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
      } else {
        ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      // Unknown flag: err is set and control falls through to the common
      // reply path after the dispatch chain.
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }
  } else if (prefix == "osd unset") {
    // Clear a named cluster-wide osdmap flag; mirrors the "osd set" dispatch
    // above (pglog_hardlimit intentionally has no unset path here).
    string key;
    cmd_getval(cmdmap, "key", key);
    if (key == "pause")
      return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
    else if (key == "noup")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
    else if (key == "nodown")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
    else if (key == "noout")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
    else if (key == "noin")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
    else if (key == "nobackfill")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
    else if (key == "norebalance")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
    else if (key == "norecover")
      return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
    else if (key == "noscrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
    else if (key == "nodeep-scrub")
      return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
    else if (key == "notieragent")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
    else if (key == "nosnaptrim")
      return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
    else {
      ss << "unrecognized flag '" << key << "'";
      err = -EINVAL;
    }
  } else if (prefix == "osd require-osd-release") {
    // Raise require_osd_release. Per-release gates check that all mons carry
    // the release's monmap feature and (unless --yes-i-really-mean-it) that
    // all up OSDs advertise the matching SERVER_* feature. Lowering is
    // refused; setting the current value is an idempotent success.
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply;
    }
    ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
    if (!osdmap.get_num_up_osds() && !sure) {
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply;
    }
    if (rel == ceph_release_t::mimic) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_MIMIC)) {
        ss << "not all mons are mimic";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_MIMIC))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::nautilus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_NAUTILUS)) {
        ss << "not all mons are nautilus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_NAUTILUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
        err = -EPERM;
        goto reply;
      }
    } else if (rel == ceph_release_t::octopus) {
      if (!mon->monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_OCTOPUS)) {
        ss << "not all mons are octopus";
        err = -EPERM;
        goto reply;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
          && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
        err = -EPERM;
        goto reply;
      }
    } else {
      // Any release without an explicit gate above is not yet supported.
      ss << "not supported for this release yet";
      err = -EPERM;
      goto reply;
    }
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
  } else if (prefix == "osd down" ||
             prefix == "osd out" ||
             prefix == "osd in" ||
             prefix == "osd rm" ||
             prefix == "osd stop") {
    // Shared handler for per-OSD state changes. Accepts a list of ids or a
    // wildcard ("any"/"all"/"*"); per-OSD results accumulate in ss, and a
    // proposal is made only if at least one OSD actually changed (any).

    bool any = false;       // did we stage at least one change?
    bool stop = false;      // wildcard consumed; stop iterating idvec
    bool verbose = true;    // suppress "already ..." spam for wildcards
    bool definitely_dead = false;

    vector<string> idvec;
    cmd_getval(cmdmap, "ids", idvec);
    cmd_getval(cmdmap, "definitely_dead", definitely_dead);
    // NOTE(review): this logs at error level via derr on every invocation —
    // looks like a debugging leftover; presumably should be dout.
    derr << "definitely_dead " << (int)definitely_dead << dendl;
    for (unsigned j = 0; j < idvec.size() && !stop; j++) {
      set<int> osds;

      // wildcard?
      if (j == 0 &&
          (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
        if (prefix == "osd in") {
          // touch out osds only
          osdmap.get_out_existing_osds(osds);
        } else {
          osdmap.get_all_osds(osds);
        }
        stop = true;
        verbose = false; // so the output is less noisy.
      } else {
        long osd = parse_osd_id(idvec[j].c_str(), &ss);
        if (osd < 0) {
          // NOTE(review): message lacks a space before the id
          // ("invalid osd id-1"); invalid ids are skipped, not fatal.
          ss << "invalid osd id" << osd;
          err = -EINVAL;
          continue;
        } else if (!osdmap.exists(osd)) {
          ss << "osd." << osd << " does not exist. ";
          continue;
        }

        osds.insert(osd);
      }

      for (auto &osd : osds) {
        if (prefix == "osd down") {
          if (osdmap.is_down(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already down. ";
          } else {
            // Setting the UP bit in pending_osd_state marks it for clearing
            // (XOR semantics), i.e. the OSD will be recorded as down.
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
            ss << "marked down osd." << osd << ". ";
            any = true;
          }
          if (definitely_dead) {
            // Record the epoch at which the OSD is known dead in its xinfo,
            // even if it was already down.
            if (!pending_inc.new_xinfo.count(osd)) {
              pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
            }
            if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
              any = true;
            }
            pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
          }
        } else if (prefix == "osd out") {
          if (osdmap.is_out(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already out. ";
          } else {
            pending_inc.new_weight[osd] = CEPH_OSD_OUT;
            if (osdmap.osd_weight[osd]) {
              // Remember the previous weight so "osd in" can restore it.
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
            }
            ss << "marked out osd." << osd << ". ";
            std::ostringstream msg;
            msg << "Client " << op->get_session()->entity_name
                << " marked osd." << osd << " out";
            if (osdmap.is_up(osd)) {
              msg << ", while it was still marked up";
            } else {
              auto period = ceph_clock_now() - down_pending_out[osd];
              msg << ", after it was down for " << int(period.sec())
                  << " seconds";
            }

            mon->clog->info() << msg.str();
            any = true;
          }
        } else if (prefix == "osd in") {
          if (osdmap.is_in(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already in. ";
          } else {
            if (osdmap.osd_xinfo[osd].old_weight > 0) {
              // Restore the weight saved when the OSD was marked out.
              pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
              if (pending_inc.new_xinfo.count(osd) == 0) {
                pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
              }
              pending_inc.new_xinfo[osd].old_weight = 0;
            } else {
              pending_inc.new_weight[osd] = CEPH_OSD_IN;
            }
            ss << "marked in osd." << osd << ". ";
            any = true;
          }
        } else if (prefix == "osd rm") {
          err = prepare_command_osd_remove(osd);

          if (err == -EBUSY) {
            // Still up: report and continue with remaining ids.
            if (any)
              ss << ", ";
            ss << "osd." << osd << " is still up; must be down before removal. ";
          } else {
            ceph_assert(err == 0);
            if (any) {
              ss << ", osd." << osd;
            } else {
              ss << "removed osd." << osd;
            }
            any = true;
          }
        } else if (prefix == "osd stop") {
          if (osdmap.is_stop(osd)) {
            if (verbose)
              ss << "osd." << osd << " is already stopped. ";
          } else if (osdmap.is_down(osd)) {
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
            ss << "stop down osd." << osd << ". ";
            any = true;
          } else {
            // Up OSD: clear UP and set STOP in one pending-state change.
            pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
            ss << "stop osd." << osd << ". ";
            any = true;
          }
        }
      }
    }
    if (any) {
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
                                                            get_last_committed() + 1));
      return true;
    }
11352 } else if (prefix == "osd set-group" ||
11353 prefix == "osd unset-group" ||
11354 prefix == "osd add-noup" ||
11355 prefix == "osd add-nodown" ||
11356 prefix == "osd add-noin" ||
11357 prefix == "osd add-noout" ||
11358 prefix == "osd rm-noup" ||
11359 prefix == "osd rm-nodown" ||
11360 prefix == "osd rm-noin" ||
11361 prefix == "osd rm-noout") {
11362 bool do_set = prefix == "osd set-group" ||
11363 prefix.find("add") != string::npos;
11364 string flag_str;
11365 unsigned flags = 0;
11366 vector<string> who;
11367 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11368 cmd_getval(cmdmap, "flags", flag_str);
11369 cmd_getval(cmdmap, "who", who);
11370 vector<string> raw_flags;
11371 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11372 for (auto& f : raw_flags) {
11373 if (f == "noup")
11374 flags |= CEPH_OSD_NOUP;
11375 else if (f == "nodown")
11376 flags |= CEPH_OSD_NODOWN;
11377 else if (f == "noin")
11378 flags |= CEPH_OSD_NOIN;
11379 else if (f == "noout")
11380 flags |= CEPH_OSD_NOOUT;
11381 else {
11382 ss << "unrecognized flag '" << f << "', must be one of "
11383 << "{noup,nodown,noin,noout}";
11384 err = -EINVAL;
11385 goto reply;
11386 }
11387 }
11388 } else {
11389 cmd_getval(cmdmap, "ids", who);
11390 if (prefix.find("noup") != string::npos)
11391 flags = CEPH_OSD_NOUP;
11392 else if (prefix.find("nodown") != string::npos)
11393 flags = CEPH_OSD_NODOWN;
11394 else if (prefix.find("noin") != string::npos)
11395 flags = CEPH_OSD_NOIN;
11396 else if (prefix.find("noout") != string::npos)
11397 flags = CEPH_OSD_NOOUT;
11398 else
11399 ceph_assert(0 == "Unreachable!");
11400 }
11401 if (flags == 0) {
11402 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11403 err = -EINVAL;
11404 goto reply;
11405 }
11406 if (who.empty()) {
11407 ss << "must specify at least one or more targets to set/unset";
11408 err = -EINVAL;
11409 goto reply;
11410 }
11411 set<int> osds;
11412 set<int> crush_nodes;
11413 set<int> device_classes;
11414 for (auto& w : who) {
11415 if (w == "any" || w == "all" || w == "*") {
11416 osdmap.get_all_osds(osds);
11417 break;
11418 }
11419 std::stringstream ts;
11420 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11421 osds.insert(osd);
11422 } else if (osdmap.crush->name_exists(w)) {
11423 crush_nodes.insert(osdmap.crush->get_item_id(w));
11424 } else if (osdmap.crush->class_exists(w)) {
11425 device_classes.insert(osdmap.crush->get_class_id(w));
11426 } else {
11427 ss << "unable to parse osd id or crush node or device class: "
11428 << "\"" << w << "\". ";
11429 }
11430 }
11431 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11432 // ss has reason for failure
11433 err = -EINVAL;
11434 goto reply;
11435 }
11436 bool any = false;
11437 for (auto osd : osds) {
11438 if (!osdmap.exists(osd)) {
11439 ss << "osd." << osd << " does not exist. ";
11440 continue;
11441 }
11442 if (do_set) {
11443 if (flags & CEPH_OSD_NOUP) {
11444 any |= osdmap.is_noup_by_osd(osd) ?
11445 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11446 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11447 }
11448 if (flags & CEPH_OSD_NODOWN) {
11449 any |= osdmap.is_nodown_by_osd(osd) ?
11450 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11451 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11452 }
11453 if (flags & CEPH_OSD_NOIN) {
11454 any |= osdmap.is_noin_by_osd(osd) ?
11455 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11456 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11457 }
11458 if (flags & CEPH_OSD_NOOUT) {
11459 any |= osdmap.is_noout_by_osd(osd) ?
11460 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11461 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11462 }
11463 } else {
11464 if (flags & CEPH_OSD_NOUP) {
11465 any |= osdmap.is_noup_by_osd(osd) ?
11466 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11467 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11468 }
11469 if (flags & CEPH_OSD_NODOWN) {
11470 any |= osdmap.is_nodown_by_osd(osd) ?
11471 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11472 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11473 }
11474 if (flags & CEPH_OSD_NOIN) {
11475 any |= osdmap.is_noin_by_osd(osd) ?
11476 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11477 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11478 }
11479 if (flags & CEPH_OSD_NOOUT) {
11480 any |= osdmap.is_noout_by_osd(osd) ?
11481 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11482 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11483 }
11484 }
11485 }
11486 for (auto& id : crush_nodes) {
11487 auto old_flags = osdmap.get_crush_node_flags(id);
11488 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11489 pending_flags |= old_flags; // adopt existing flags first!
11490 if (do_set) {
11491 pending_flags |= flags;
11492 } else {
11493 pending_flags &= ~flags;
11494 }
11495 any = true;
11496 }
11497 for (auto& id : device_classes) {
11498 auto old_flags = osdmap.get_device_class_flags(id);
11499 auto& pending_flags = pending_inc.new_device_class_flags[id];
11500 pending_flags |= old_flags;
11501 if (do_set) {
11502 pending_flags |= flags;
11503 } else {
11504 pending_flags &= ~flags;
11505 }
11506 any = true;
11507 }
11508 if (any) {
11509 getline(ss, rs);
11510 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11511 get_last_committed() + 1));
11512 return true;
11513 }
  } else if (prefix == "osd pg-temp") {
    // Manually set (or, with an empty id list, clear) the pg_temp mapping of
    // a PG. The new acting set must respect the pool's min_size/size bounds.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.new_pg_temp.count(pgid)) {
      // An update for this pgid is already staged: retry after it commits.
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty())  {
      // Empty id list clears the pg_temp entry.
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Set a primary_temp override for a PG (osd id -1 clears it). Requires
    // clients new enough to understand primary_temp (firefly or later).
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 is allowed: it removes the override rather than naming an OSD.
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by perturbing its pg_temp mapping; once the PG
    // repeers, it will discard the temp mapping and return to its CRUSH
    // placement.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change.  Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items") {
    // pg-upmap family: explicit placement overrides. Requires luminous+
    // clients and the OSDMAP_PG_UPMAP cluster feature.
    if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
      ss << "min_compat_client "
         << osdmap.require_min_compat_client
         << " < luminous, which is required for pg-upmap. "
         << "Try 'ceph osd set-require-min-compat-client luminous' "
         << "before using the new interface";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    if (pending_inc.old_pools.count(pgid.pool())) {
      // Pool deletion is staged; answer -ENOENT once the deletion commits.
      ss << "pool of " << pgid << " is pending removal";
      err = -ENOENT;
      getline(ss, rs);
      wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
      return true;
    }

    // Which of the four sub-commands are we running?
    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
    } option;

    if (prefix == "osd pg-upmap") {
      option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      option = OP_PG_UPMAP_ITEMS;
    } else {
      option = OP_RM_PG_UPMAP_ITEMS;
    }

    // check pending upmap changes
    // If this pgid already has a staged upmap change of the same family,
    // retry after it commits rather than stacking conflicting edits.
    switch (option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    default:
      ceph_abort_msg("invalid option");
    }

    switch (option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // The explicit mapping must still satisfy the pool's size bounds.
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // Skip duplicate ids rather than failing.
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // Items come as (from, to) osd-id pairs, hence an even count.
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
11845 int from = *p++;
11846 int to = *p;
11847 if (from == to) {
11848 ss << "from osd." << from << " == to osd." << to << ", ";
11849 continue;
11850 }
11851 if (!osdmap.exists(from)) {
11852 ss << "osd." << from << " does not exist";
11853 err = -ENOENT;
11854 goto reply;
11855 }
11856 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
11857 ss << "osd." << to << " does not exist";
11858 err = -ENOENT;
11859 goto reply;
11860 }
11861 pair<int32_t,int32_t> entry = make_pair(from, to);
11862 auto it = std::find(new_pg_upmap_items.begin(),
11863 new_pg_upmap_items.end(), entry);
11864 if (it != new_pg_upmap_items.end()) {
11865 ss << "osd." << from << " -> osd." << to << " already exists, ";
11866 continue;
11867 }
11868 new_pg_upmap_items.push_back(entry);
11869 items << from << "->" << to << ",";
11870 }
11871 string out(items.str());
11872 out.resize(out.size() - 1); // drop last ','
11873 out += "]";
11874
11875 if (new_pg_upmap_items.empty()) {
11876 ss << "no valid upmap items(pairs) is specified";
11877 err = -EINVAL;
11878 goto reply;
11879 }
11880
11881 pending_inc.new_pg_upmap_items[pgid] =
11882 mempool::osdmap::vector<pair<int32_t,int32_t>>(
11883 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
11884 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
11885 }
11886 break;
11887
11888 case OP_RM_PG_UPMAP_ITEMS:
11889 {
11890 pending_inc.old_pg_upmap_items.insert(pgid);
11891 ss << "clear " << pgid << " pg_upmap_items mapping";
11892 }
11893 break;
11894
11895 default:
11896 ceph_abort_msg("invalid option");
11897 }
11898
11899 goto update;
11900 } else if (prefix == "osd primary-affinity") {
11901 int64_t id;
11902 if (!cmd_getval(cmdmap, "id", id)) {
11903 ss << "invalid osd id value '"
11904 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11905 err = -EINVAL;
11906 goto reply;
11907 }
11908 double w;
11909 if (!cmd_getval(cmdmap, "weight", w)) {
11910 ss << "unable to parse 'weight' value '"
11911 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11912 err = -EINVAL;
11913 goto reply;
11914 }
11915 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
11916 if (ww < 0L) {
11917 ss << "weight must be >= 0";
11918 err = -EINVAL;
11919 goto reply;
11920 }
11921 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
11922 osdmap.require_min_compat_client < ceph_release_t::firefly) {
11923 ss << "require_min_compat_client "
11924 << osdmap.require_min_compat_client
11925 << " < firefly, which is required for primary-affinity";
11926 err = -EPERM;
11927 goto reply;
11928 }
11929 if (osdmap.exists(id)) {
11930 pending_inc.new_primary_affinity[id] = ww;
11931 ss << "set osd." << id << " primary-affinity to " << w << " (" << ios::hex << ww << ios::dec << ")";
11932 getline(ss, rs);
11933 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11934 get_last_committed() + 1));
11935 return true;
11936 } else {
11937 ss << "osd." << id << " does not exist";
11938 err = -ENOENT;
11939 goto reply;
11940 }
11941 } else if (prefix == "osd reweight") {
11942 int64_t id;
11943 if (!cmd_getval(cmdmap, "id", id)) {
11944 ss << "unable to parse osd id value '"
11945 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11946 err = -EINVAL;
11947 goto reply;
11948 }
11949 double w;
11950 if (!cmd_getval(cmdmap, "weight", w)) {
11951 ss << "unable to parse weight value '"
11952 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11953 err = -EINVAL;
11954 goto reply;
11955 }
11956 long ww = (int)((double)CEPH_OSD_IN*w);
11957 if (ww < 0L) {
11958 ss << "weight must be >= 0";
11959 err = -EINVAL;
11960 goto reply;
11961 }
11962 if (osdmap.exists(id)) {
11963 pending_inc.new_weight[id] = ww;
11964 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
11965 getline(ss, rs);
11966 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11967 get_last_committed() + 1));
11968 return true;
11969 } else {
11970 ss << "osd." << id << " does not exist";
11971 err = -ENOENT;
11972 goto reply;
11973 }
11974 } else if (prefix == "osd reweightn") {
11975 map<int32_t, uint32_t> weights;
11976 err = parse_reweights(cct, cmdmap, osdmap, &weights);
11977 if (err) {
11978 ss << "unable to parse 'weights' value '"
11979 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
11980 goto reply;
11981 }
11982 pending_inc.new_weight.insert(weights.begin(), weights.end());
11983 wait_for_finished_proposal(
11984 op,
11985 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
11986 return true;
11987 } else if (prefix == "osd lost") {
11988 int64_t id;
11989 if (!cmd_getval(cmdmap, "id", id)) {
11990 ss << "unable to parse osd id value '"
11991 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
11992 err = -EINVAL;
11993 goto reply;
11994 }
11995 bool sure = false;
11996 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11997 if (!sure) {
11998 ss << "are you SURE? this might mean real, permanent data loss. pass "
11999 "--yes-i-really-mean-it if you really do.";
12000 err = -EPERM;
12001 goto reply;
12002 } else if (!osdmap.exists(id)) {
12003 ss << "osd." << id << " does not exist";
12004 err = -ENOENT;
12005 goto reply;
12006 } else if (!osdmap.is_down(id)) {
12007 ss << "osd." << id << " is not down";
12008 err = -EBUSY;
12009 goto reply;
12010 } else {
12011 epoch_t e = osdmap.get_info(id).down_at;
12012 pending_inc.new_lost[id] = e;
12013 ss << "marked osd lost in epoch " << e;
12014 getline(ss, rs);
12015 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12016 get_last_committed() + 1));
12017 return true;
12018 }
12019
12020 } else if (prefix == "osd destroy-actual" ||
12021 prefix == "osd purge-actual" ||
12022 prefix == "osd purge-new") {
12023 /* Destroying an OSD means that we don't expect to further make use of
12024 * the OSDs data (which may even become unreadable after this operation),
12025 * and that we are okay with scrubbing all its cephx keys and config-key
12026 * data (which may include lockbox keys, thus rendering the osd's data
12027 * unreadable).
12028 *
12029 * The OSD will not be removed. Instead, we will mark it as destroyed,
12030 * such that a subsequent call to `create` will not reuse the osd id.
12031 * This will play into being able to recreate the OSD, at the same
12032 * crush location, with minimal data movement.
12033 */
12034
12035 // make sure authmon is writeable.
12036 if (!mon->authmon()->is_writeable()) {
12037 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12038 << "osd destroy" << dendl;
12039 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12040 return false;
12041 }
12042
12043 int64_t id;
12044 if (!cmd_getval(cmdmap, "id", id)) {
12045 auto p = cmdmap.find("id");
12046 if (p == cmdmap.end()) {
12047 ss << "no osd id specified";
12048 } else {
12049 ss << "unable to parse osd id value '"
12050 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12051 }
12052 err = -EINVAL;
12053 goto reply;
12054 }
12055
12056 bool is_destroy = (prefix == "osd destroy-actual");
12057 if (!is_destroy) {
12058 ceph_assert("osd purge-actual" == prefix ||
12059 "osd purge-new" == prefix);
12060 }
12061
12062 bool sure = false;
12063 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12064 if (!sure) {
12065 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12066 << "This will mean real, permanent data loss, as well "
12067 << "as deletion of cephx and lockbox keys. "
12068 << "Pass --yes-i-really-mean-it if you really do.";
12069 err = -EPERM;
12070 goto reply;
12071 } else if (!osdmap.exists(id)) {
12072 ss << "osd." << id << " does not exist";
12073 err = 0; // idempotent
12074 goto reply;
12075 } else if (osdmap.is_up(id)) {
12076 ss << "osd." << id << " is not `down`.";
12077 err = -EBUSY;
12078 goto reply;
12079 } else if (is_destroy && osdmap.is_destroyed(id)) {
12080 ss << "destroyed osd." << id;
12081 err = 0;
12082 goto reply;
12083 }
12084
12085 if (prefix == "osd purge-new" &&
12086 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12087 ss << "osd." << id << " is not new";
12088 err = -EPERM;
12089 goto reply;
12090 }
12091
12092 bool goto_reply = false;
12093
12094 paxos->plug();
12095 if (is_destroy) {
12096 err = prepare_command_osd_destroy(id, ss);
12097 // we checked above that it should exist.
12098 ceph_assert(err != -ENOENT);
12099 } else {
12100 err = prepare_command_osd_purge(id, ss);
12101 if (err == -ENOENT) {
12102 err = 0;
12103 ss << "osd." << id << " does not exist.";
12104 goto_reply = true;
12105 }
12106 }
12107 paxos->unplug();
12108
12109 if (err < 0 || goto_reply) {
12110 goto reply;
12111 }
12112
12113 if (is_destroy) {
12114 ss << "destroyed osd." << id;
12115 } else {
12116 ss << "purged osd." << id;
12117 }
12118
12119 getline(ss, rs);
12120 wait_for_finished_proposal(op,
12121 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12122 force_immediate_propose();
12123 return true;
12124
12125 } else if (prefix == "osd new") {
12126
12127 // make sure authmon is writeable.
12128 if (!mon->authmon()->is_writeable()) {
12129 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12130 << "osd new" << dendl;
12131 mon->authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12132 return false;
12133 }
12134
12135 map<string,string> param_map;
12136
12137 bufferlist bl = m->get_data();
12138 string param_json = bl.to_str();
12139 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12140
12141 err = get_json_str_map(param_json, ss, &param_map);
12142 if (err < 0)
12143 goto reply;
12144
12145 dout(20) << __func__ << " osd new params " << param_map << dendl;
12146
12147 paxos->plug();
12148 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12149 paxos->unplug();
12150
12151 if (err < 0) {
12152 goto reply;
12153 }
12154
12155 if (f) {
12156 f->flush(rdata);
12157 } else {
12158 rdata.append(ss);
12159 }
12160
12161 if (err == EEXIST) {
12162 // idempotent operation
12163 err = 0;
12164 goto reply;
12165 }
12166
12167 wait_for_finished_proposal(op,
12168 new Monitor::C_Command(mon, op, 0, rs, rdata,
12169 get_last_committed() + 1));
12170 force_immediate_propose();
12171 return true;
12172
12173 } else if (prefix == "osd create") {
12174
12175 // optional id provided?
12176 int64_t id = -1, cmd_id = -1;
12177 if (cmd_getval(cmdmap, "id", cmd_id)) {
12178 if (cmd_id < 0) {
12179 ss << "invalid osd id value '" << cmd_id << "'";
12180 err = -EINVAL;
12181 goto reply;
12182 }
12183 dout(10) << " osd create got id " << cmd_id << dendl;
12184 }
12185
12186 uuid_d uuid;
12187 string uuidstr;
12188 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12189 if (!uuid.parse(uuidstr.c_str())) {
12190 ss << "invalid uuid value '" << uuidstr << "'";
12191 err = -EINVAL;
12192 goto reply;
12193 }
12194 // we only care about the id if we also have the uuid, to
12195 // ensure the operation's idempotency.
12196 id = cmd_id;
12197 }
12198
12199 int32_t new_id = -1;
12200 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12201 if (err < 0) {
12202 if (err == -EAGAIN) {
12203 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12204 return true;
12205 }
12206 // a check has failed; reply to the user.
12207 goto reply;
12208
12209 } else if (err == EEXIST) {
12210 // this is an idempotent operation; we can go ahead and reply.
12211 if (f) {
12212 f->open_object_section("created_osd");
12213 f->dump_int("osdid", new_id);
12214 f->close_section();
12215 f->flush(rdata);
12216 } else {
12217 ss << new_id;
12218 rdata.append(ss);
12219 }
12220 err = 0;
12221 goto reply;
12222 }
12223
12224 string empty_device_class;
12225 do_osd_create(id, uuid, empty_device_class, &new_id);
12226
12227 if (f) {
12228 f->open_object_section("created_osd");
12229 f->dump_int("osdid", new_id);
12230 f->close_section();
12231 f->flush(rdata);
12232 } else {
12233 ss << new_id;
12234 rdata.append(ss);
12235 }
12236 wait_for_finished_proposal(op,
12237 new Monitor::C_Command(mon, op, 0, rs, rdata,
12238 get_last_committed() + 1));
12239 return true;
12240
12241 } else if (prefix == "osd blacklist clear") {
12242 pending_inc.new_blacklist.clear();
12243 std::list<std::pair<entity_addr_t,utime_t > > blacklist;
12244 osdmap.get_blacklist(&blacklist);
12245 for (const auto &entry : blacklist) {
12246 pending_inc.old_blacklist.push_back(entry.first);
12247 }
12248 ss << " removed all blacklist entries";
12249 getline(ss, rs);
12250 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12251 get_last_committed() + 1));
12252 return true;
12253 } else if (prefix == "osd blacklist") {
12254 string addrstr;
12255 cmd_getval(cmdmap, "addr", addrstr);
12256 entity_addr_t addr;
12257 if (!addr.parse(addrstr.c_str(), 0)) {
12258 ss << "unable to parse address " << addrstr;
12259 err = -EINVAL;
12260 goto reply;
12261 }
12262 else {
12263 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12264 // always blacklist type ANY
12265 addr.set_type(entity_addr_t::TYPE_ANY);
12266 } else {
12267 addr.set_type(entity_addr_t::TYPE_LEGACY);
12268 }
12269
12270 string blacklistop;
12271 cmd_getval(cmdmap, "blacklistop", blacklistop);
12272 if (blacklistop == "add") {
12273 utime_t expires = ceph_clock_now();
12274 double d;
12275 // default one hour
12276 cmd_getval(cmdmap, "expire", d,
12277 g_conf()->mon_osd_blacklist_default_expire);
12278 expires += d;
12279
12280 pending_inc.new_blacklist[addr] = expires;
12281
12282 {
12283 // cancel any pending un-blacklisting request too
12284 auto it = std::find(pending_inc.old_blacklist.begin(),
12285 pending_inc.old_blacklist.end(), addr);
12286 if (it != pending_inc.old_blacklist.end()) {
12287 pending_inc.old_blacklist.erase(it);
12288 }
12289 }
12290
12291 ss << "blacklisting " << addr << " until " << expires << " (" << d << " sec)";
12292 getline(ss, rs);
12293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12294 get_last_committed() + 1));
12295 return true;
12296 } else if (blacklistop == "rm") {
12297 if (osdmap.is_blacklisted(addr) ||
12298 pending_inc.new_blacklist.count(addr)) {
12299 if (osdmap.is_blacklisted(addr))
12300 pending_inc.old_blacklist.push_back(addr);
12301 else
12302 pending_inc.new_blacklist.erase(addr);
12303 ss << "un-blacklisting " << addr;
12304 getline(ss, rs);
12305 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12306 get_last_committed() + 1));
12307 return true;
12308 }
12309 ss << addr << " isn't blacklisted";
12310 err = 0;
12311 goto reply;
12312 }
12313 }
12314 } else if (prefix == "osd pool mksnap") {
12315 string poolstr;
12316 cmd_getval(cmdmap, "pool", poolstr);
12317 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12318 if (pool < 0) {
12319 ss << "unrecognized pool '" << poolstr << "'";
12320 err = -ENOENT;
12321 goto reply;
12322 }
12323 string snapname;
12324 cmd_getval(cmdmap, "snap", snapname);
12325 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12326 if (p->is_unmanaged_snaps_mode()) {
12327 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12328 err = -EINVAL;
12329 goto reply;
12330 } else if (p->snap_exists(snapname.c_str())) {
12331 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12332 err = 0;
12333 goto reply;
12334 } else if (p->is_tier()) {
12335 ss << "pool " << poolstr << " is a cache tier";
12336 err = -EINVAL;
12337 goto reply;
12338 }
12339 pg_pool_t *pp = 0;
12340 if (pending_inc.new_pools.count(pool))
12341 pp = &pending_inc.new_pools[pool];
12342 if (!pp) {
12343 pp = &pending_inc.new_pools[pool];
12344 *pp = *p;
12345 }
12346 if (pp->snap_exists(snapname.c_str())) {
12347 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12348 } else {
12349 pp->add_snap(snapname.c_str(), ceph_clock_now());
12350 pp->set_snap_epoch(pending_inc.epoch);
12351 ss << "created pool " << poolstr << " snap " << snapname;
12352 }
12353 getline(ss, rs);
12354 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12355 get_last_committed() + 1));
12356 return true;
12357 } else if (prefix == "osd pool rmsnap") {
12358 string poolstr;
12359 cmd_getval(cmdmap, "pool", poolstr);
12360 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12361 if (pool < 0) {
12362 ss << "unrecognized pool '" << poolstr << "'";
12363 err = -ENOENT;
12364 goto reply;
12365 }
12366 string snapname;
12367 cmd_getval(cmdmap, "snap", snapname);
12368 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12369 if (p->is_unmanaged_snaps_mode()) {
12370 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12371 err = -EINVAL;
12372 goto reply;
12373 } else if (!p->snap_exists(snapname.c_str())) {
12374 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12375 err = 0;
12376 goto reply;
12377 }
12378 pg_pool_t *pp = 0;
12379 if (pending_inc.new_pools.count(pool))
12380 pp = &pending_inc.new_pools[pool];
12381 if (!pp) {
12382 pp = &pending_inc.new_pools[pool];
12383 *pp = *p;
12384 }
12385 snapid_t sn = pp->snap_exists(snapname.c_str());
12386 if (sn) {
12387 pp->remove_snap(sn);
12388 pp->set_snap_epoch(pending_inc.epoch);
12389 ss << "removed pool " << poolstr << " snap " << snapname;
12390 } else {
12391 ss << "already removed pool " << poolstr << " snap " << snapname;
12392 }
12393 getline(ss, rs);
12394 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12395 get_last_committed() + 1));
12396 return true;
12397 } else if (prefix == "osd pool create") {
12398 int64_t pg_num, pg_num_min;
12399 int64_t pgp_num;
12400 cmd_getval(cmdmap, "pg_num", pg_num, int64_t(0));
12401 cmd_getval(cmdmap, "pgp_num", pgp_num, pg_num);
12402 cmd_getval(cmdmap, "pg_num_min", pg_num_min, int64_t(0));
12403
12404 string pool_type_str;
12405 cmd_getval(cmdmap, "pool_type", pool_type_str);
12406 if (pool_type_str.empty())
12407 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12408
12409 string poolstr;
12410 cmd_getval(cmdmap, "pool", poolstr);
12411 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12412 if (pool_id >= 0) {
12413 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12414 if (pool_type_str != p->get_type_name()) {
12415 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12416 err = -EINVAL;
12417 } else {
12418 ss << "pool '" << poolstr << "' already exists";
12419 err = 0;
12420 }
12421 goto reply;
12422 }
12423
12424 int pool_type;
12425 if (pool_type_str == "replicated") {
12426 pool_type = pg_pool_t::TYPE_REPLICATED;
12427 } else if (pool_type_str == "erasure") {
12428 pool_type = pg_pool_t::TYPE_ERASURE;
12429 } else {
12430 ss << "unknown pool type '" << pool_type_str << "'";
12431 err = -EINVAL;
12432 goto reply;
12433 }
12434
12435 bool implicit_rule_creation = false;
12436 int64_t expected_num_objects = 0;
12437 string rule_name;
12438 cmd_getval(cmdmap, "rule", rule_name);
12439 string erasure_code_profile;
12440 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12441
12442 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12443 if (erasure_code_profile == "")
12444 erasure_code_profile = "default";
12445 //handle the erasure code profile
12446 if (erasure_code_profile == "default") {
12447 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12448 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12449 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12450 goto wait;
12451 }
12452
12453 map<string,string> profile_map;
12454 err = osdmap.get_erasure_code_profile_default(cct,
12455 profile_map,
12456 &ss);
12457 if (err)
12458 goto reply;
12459 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12460 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12461 goto wait;
12462 }
12463 }
12464 if (rule_name == "") {
12465 implicit_rule_creation = true;
12466 if (erasure_code_profile == "default") {
12467 rule_name = "erasure-code";
12468 } else {
12469 dout(1) << "implicitly use rule named after the pool: "
12470 << poolstr << dendl;
12471 rule_name = poolstr;
12472 }
12473 }
12474 cmd_getval(cmdmap, "expected_num_objects",
12475 expected_num_objects, int64_t(0));
12476 } else {
12477 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12478 // and put expected_num_objects to rule field
12479 if (erasure_code_profile != "") { // cmd is from CLI
12480 if (rule_name != "") {
12481 string interr;
12482 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12483 if (interr.length()) {
12484 ss << "error parsing integer value '" << rule_name << "': " << interr;
12485 err = -EINVAL;
12486 goto reply;
12487 }
12488 }
12489 rule_name = erasure_code_profile;
12490 } else { // cmd is well-formed
12491 cmd_getval(cmdmap, "expected_num_objects",
12492 expected_num_objects, int64_t(0));
12493 }
12494 }
12495
12496 if (!implicit_rule_creation && rule_name != "") {
12497 int rule;
12498 err = get_crush_rule(rule_name, &rule, &ss);
12499 if (err == -EAGAIN) {
12500 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12501 return true;
12502 }
12503 if (err)
12504 goto reply;
12505 }
12506
12507 if (expected_num_objects < 0) {
12508 ss << "'expected_num_objects' must be non-negative";
12509 err = -EINVAL;
12510 goto reply;
12511 }
12512
12513 if (expected_num_objects > 0 &&
12514 cct->_conf->osd_objectstore == "filestore" &&
12515 cct->_conf->filestore_merge_threshold > 0) {
12516 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12517 err = -EINVAL;
12518 goto reply;
12519 }
12520
12521 if (expected_num_objects == 0 &&
12522 cct->_conf->osd_objectstore == "filestore" &&
12523 cct->_conf->filestore_merge_threshold < 0) {
12524 int osds = osdmap.get_num_osds();
12525 if (osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12526 ss << "For better initial performance on pools expected to store a "
12527 << "large number of objects, consider supplying the "
12528 << "expected_num_objects parameter when creating the pool.\n";
12529 }
12530 }
12531
12532 int64_t fast_read_param;
12533 cmd_getval(cmdmap, "fast_read", fast_read_param, int64_t(-1));
12534 FastReadType fast_read = FAST_READ_DEFAULT;
12535 if (fast_read_param == 0)
12536 fast_read = FAST_READ_OFF;
12537 else if (fast_read_param > 0)
12538 fast_read = FAST_READ_ON;
12539
12540 int64_t repl_size = 0;
12541 cmd_getval(cmdmap, "size", repl_size);
12542 int64_t target_size_bytes = 0;
12543 double target_size_ratio = 0.0;
12544 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12545 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12546
12547 string pg_autoscale_mode;
12548 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12549
12550 err = prepare_new_pool(poolstr,
12551 -1, // default crush rule
12552 rule_name,
12553 pg_num, pgp_num, pg_num_min,
12554 repl_size, target_size_bytes, target_size_ratio,
12555 erasure_code_profile, pool_type,
12556 (uint64_t)expected_num_objects,
12557 fast_read,
12558 pg_autoscale_mode,
12559 &ss);
12560 if (err < 0) {
12561 switch(err) {
12562 case -EEXIST:
12563 ss << "pool '" << poolstr << "' already exists";
12564 break;
12565 case -EAGAIN:
12566 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12567 return true;
12568 case -ERANGE:
12569 goto reply;
12570 default:
12571 goto reply;
12572 break;
12573 }
12574 } else {
12575 ss << "pool '" << poolstr << "' created";
12576 }
12577 getline(ss, rs);
12578 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12579 get_last_committed() + 1));
12580 return true;
12581
12582 } else if (prefix == "osd pool delete" ||
12583 prefix == "osd pool rm") {
12584 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12585 string poolstr, poolstr2, sure;
12586 cmd_getval(cmdmap, "pool", poolstr);
12587 cmd_getval(cmdmap, "pool2", poolstr2);
12588 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12589 if (pool < 0) {
12590 ss << "pool '" << poolstr << "' does not exist";
12591 err = 0;
12592 goto reply;
12593 }
12594
12595 bool force_no_fake = false;
12596 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
12597 bool force = false;
12598 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
12599 if (poolstr2 != poolstr ||
12600 (!force && !force_no_fake)) {
12601 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12602 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12603 << "followed by --yes-i-really-really-mean-it.";
12604 err = -EPERM;
12605 goto reply;
12606 }
12607 err = _prepare_remove_pool(pool, &ss, force_no_fake);
12608 if (err == -EAGAIN) {
12609 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12610 return true;
12611 }
12612 if (err < 0)
12613 goto reply;
12614 goto update;
12615 } else if (prefix == "osd pool rename") {
12616 string srcpoolstr, destpoolstr;
12617 cmd_getval(cmdmap, "srcpool", srcpoolstr);
12618 cmd_getval(cmdmap, "destpool", destpoolstr);
12619 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
12620 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
12621
12622 if (pool_src < 0) {
12623 if (pool_dst >= 0) {
12624 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12625 // of operations, assume this rename succeeded, as it is not changing
12626 // the current state. Make sure we output something understandable
12627 // for whoever is issuing the command, if they are paying attention,
12628 // in case it was not intentional; or to avoid a "wtf?" and a bug
12629 // report in case it was intentional, while expecting a failure.
12630 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
12631 << destpoolstr << "' does -- assuming successful rename";
12632 err = 0;
12633 } else {
12634 ss << "unrecognized pool '" << srcpoolstr << "'";
12635 err = -ENOENT;
12636 }
12637 goto reply;
12638 } else if (pool_dst >= 0) {
12639 // source pool exists and so does the destination pool
12640 ss << "pool '" << destpoolstr << "' already exists";
12641 err = -EEXIST;
12642 goto reply;
12643 }
12644
12645 int ret = _prepare_rename_pool(pool_src, destpoolstr);
12646 if (ret == 0) {
12647 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
12648 } else {
12649 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
12650 << cpp_strerror(ret);
12651 }
12652 getline(ss, rs);
12653 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
12654 get_last_committed() + 1));
12655 return true;
12656
12657 } else if (prefix == "osd pool set") {
12658 err = prepare_command_pool_set(cmdmap, ss);
12659 if (err == -EAGAIN)
12660 goto wait;
12661 if (err < 0)
12662 goto reply;
12663
12664 getline(ss, rs);
12665 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12666 get_last_committed() + 1));
12667 return true;
12668 } else if (prefix == "osd tier add") {
12669 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12670 if (err == -EAGAIN)
12671 goto wait;
12672 if (err)
12673 goto reply;
12674 string poolstr;
12675 cmd_getval(cmdmap, "pool", poolstr);
12676 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12677 if (pool_id < 0) {
12678 ss << "unrecognized pool '" << poolstr << "'";
12679 err = -ENOENT;
12680 goto reply;
12681 }
12682 string tierpoolstr;
12683 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12684 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12685 if (tierpool_id < 0) {
12686 ss << "unrecognized pool '" << tierpoolstr << "'";
12687 err = -ENOENT;
12688 goto reply;
12689 }
12690 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12691 ceph_assert(p);
12692 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12693 ceph_assert(tp);
12694
12695 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
12696 goto reply;
12697 }
12698
12699 // make sure new tier is empty
12700 string force_nonempty;
12701 cmd_getval(cmdmap, "force_nonempty", force_nonempty);
12702 const pool_stat_t *pstats = mon->mgrstatmon()->get_pool_stat(tierpool_id);
12703 if (pstats && pstats->stats.sum.num_objects != 0 &&
12704 force_nonempty != "--force-nonempty") {
12705 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
12706 err = -ENOTEMPTY;
12707 goto reply;
12708 }
12709 if (tp->is_erasure()) {
12710 ss << "tier pool '" << tierpoolstr
12711 << "' is an ec pool, which cannot be a tier";
12712 err = -ENOTSUP;
12713 goto reply;
12714 }
12715 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
12716 ((force_nonempty != "--force-nonempty") ||
12717 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps))) {
12718 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
12719 err = -ENOTEMPTY;
12720 goto reply;
12721 }
12722 // go
12723 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12724 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12725 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
12726 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12727 return true;
12728 }
12729 np->tiers.insert(tierpool_id);
12730 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
12731 ntp->tier_of = pool_id;
12732 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
12733 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12734 get_last_committed() + 1));
12735 return true;
12736 } else if (prefix == "osd tier remove" ||
12737 prefix == "osd tier rm") {
12738 string poolstr;
12739 cmd_getval(cmdmap, "pool", poolstr);
12740 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12741 if (pool_id < 0) {
12742 ss << "unrecognized pool '" << poolstr << "'";
12743 err = -ENOENT;
12744 goto reply;
12745 }
12746 string tierpoolstr;
12747 cmd_getval(cmdmap, "tierpool", tierpoolstr);
12748 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
12749 if (tierpool_id < 0) {
12750 ss << "unrecognized pool '" << tierpoolstr << "'";
12751 err = -ENOENT;
12752 goto reply;
12753 }
12754 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12755 ceph_assert(p);
12756 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
12757 ceph_assert(tp);
12758
12759 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
12760 goto reply;
12761 }
12762
12763 if (p->tiers.count(tierpool_id) == 0) {
12764 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12765 err = 0;
12766 goto reply;
12767 }
12768 if (tp->tier_of != pool_id) {
12769 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
12770 << osdmap.get_pool_name(tp->tier_of) << "': "
12771 // be scary about it; this is an inconsistency and bells must go off
12772 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
12773 err = -EINVAL;
12774 goto reply;
12775 }
12776 if (p->read_tier == tierpool_id) {
12777 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
12778 err = -EBUSY;
12779 goto reply;
12780 }
12781 // go
12782 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12783 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
12784 if (np->tiers.count(tierpool_id) == 0 ||
12785 ntp->tier_of != pool_id ||
12786 np->read_tier == tierpool_id) {
12787 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12788 return true;
12789 }
12790 np->tiers.erase(tierpool_id);
12791 ntp->clear_tier();
12792 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
12793 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12794 get_last_committed() + 1));
12795 return true;
12796 } else if (prefix == "osd tier set-overlay") {
12797 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12798 if (err == -EAGAIN)
12799 goto wait;
12800 if (err)
12801 goto reply;
12802 string poolstr;
12803 cmd_getval(cmdmap, "pool", poolstr);
12804 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12805 if (pool_id < 0) {
12806 ss << "unrecognized pool '" << poolstr << "'";
12807 err = -ENOENT;
12808 goto reply;
12809 }
12810 string overlaypoolstr;
12811 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
12812 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
12813 if (overlaypool_id < 0) {
12814 ss << "unrecognized pool '" << overlaypoolstr << "'";
12815 err = -ENOENT;
12816 goto reply;
12817 }
12818 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12819 ceph_assert(p);
12820 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
12821 ceph_assert(overlay_p);
12822 if (p->tiers.count(overlaypool_id) == 0) {
12823 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
12824 err = -EINVAL;
12825 goto reply;
12826 }
12827 if (p->read_tier == overlaypool_id) {
12828 err = 0;
12829 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12830 goto reply;
12831 }
12832 if (p->has_read_tier()) {
12833 ss << "pool '" << poolstr << "' has overlay '"
12834 << osdmap.get_pool_name(p->read_tier)
12835 << "'; please remove-overlay first";
12836 err = -EINVAL;
12837 goto reply;
12838 }
12839
12840 // go
12841 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12842 np->read_tier = overlaypool_id;
12843 np->write_tier = overlaypool_id;
12844 np->set_last_force_op_resend(pending_inc.epoch);
12845 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
12846 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
12847 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
12848 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
12849 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
12850 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12851 get_last_committed() + 1));
12852 return true;
12853 } else if (prefix == "osd tier remove-overlay" ||
12854 prefix == "osd tier rm-overlay") {
12855 string poolstr;
12856 cmd_getval(cmdmap, "pool", poolstr);
12857 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12858 if (pool_id < 0) {
12859 ss << "unrecognized pool '" << poolstr << "'";
12860 err = -ENOENT;
12861 goto reply;
12862 }
12863 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12864 ceph_assert(p);
12865 if (!p->has_read_tier()) {
12866 err = 0;
12867 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12868 goto reply;
12869 }
12870
12871 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
12872 goto reply;
12873 }
12874
12875 // go
12876 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
12877 if (np->has_read_tier()) {
12878 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
12879 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
12880 nop->set_last_force_op_resend(pending_inc.epoch);
12881 }
12882 if (np->has_write_tier()) {
12883 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
12884 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
12885 nop->set_last_force_op_resend(pending_inc.epoch);
12886 }
12887 np->clear_read_tier();
12888 np->clear_write_tier();
12889 np->set_last_force_op_resend(pending_inc.epoch);
12890 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
12891 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
12892 get_last_committed() + 1));
12893 return true;
12894 } else if (prefix == "osd tier cache-mode") {
12895 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
12896 if (err == -EAGAIN)
12897 goto wait;
12898 if (err)
12899 goto reply;
12900 string poolstr;
12901 cmd_getval(cmdmap, "pool", poolstr);
12902 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12903 if (pool_id < 0) {
12904 ss << "unrecognized pool '" << poolstr << "'";
12905 err = -ENOENT;
12906 goto reply;
12907 }
12908 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12909 ceph_assert(p);
12910 if (!p->is_tier()) {
12911 ss << "pool '" << poolstr << "' is not a tier";
12912 err = -EINVAL;
12913 goto reply;
12914 }
12915 string modestr;
12916 cmd_getval(cmdmap, "mode", modestr);
12917 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
12918 if (int(mode) < 0) {
12919 ss << "'" << modestr << "' is not a valid cache mode";
12920 err = -EINVAL;
12921 goto reply;
12922 }
12923
12924 bool sure = false;
12925 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12926
12927 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
12928 mode == pg_pool_t::CACHEMODE_READFORWARD) {
12929 ss << "'" << modestr << "' is no longer a supported cache mode";
12930 err = -EPERM;
12931 goto reply;
12932 }
12933 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12934 mode != pg_pool_t::CACHEMODE_NONE &&
12935 mode != pg_pool_t::CACHEMODE_PROXY &&
12936 mode != pg_pool_t::CACHEMODE_READPROXY) &&
12937 !sure) {
12938 ss << "'" << modestr << "' is not a well-supported cache mode and may "
12939 << "corrupt your data. pass --yes-i-really-mean-it to force.";
12940 err = -EPERM;
12941 goto reply;
12942 }
12943
12944 // pool already has this cache-mode set and there are no pending changes
12945 if (p->cache_mode == mode &&
12946 (pending_inc.new_pools.count(pool_id) == 0 ||
12947 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
12948 ss << "set cache-mode for pool '" << poolstr << "'"
12949 << " to " << pg_pool_t::get_cache_mode_name(mode);
12950 err = 0;
12951 goto reply;
12952 }
12953
12954 /* Mode description:
12955 *
12956 * none: No cache-mode defined
12957 * forward: Forward all reads and writes to base pool [removed]
12958 * writeback: Cache writes, promote reads from base pool
12959 * readonly: Forward writes to base pool
12960 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
12961 * proxy: Proxy all reads and writes to base pool
12962 * readproxy: Writes are in writeback mode, Reads are in proxy mode
12963 *
12964 * Hence, these are the allowed transitions:
12965 *
12966 * none -> any
12967 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
12968 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
12969 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
12970 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
12971 * writeback -> readproxy || proxy
12972 * readonly -> any
12973 */
12974
12975 // We check if the transition is valid against the current pool mode, as
12976 // it is the only committed state thus far. We will blantly squash
12977 // whatever mode is on the pending state.
12978
12979 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
12980 (mode != pg_pool_t::CACHEMODE_PROXY &&
12981 mode != pg_pool_t::CACHEMODE_READPROXY)) {
12982 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
12983 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
12984 << "' pool; only '"
12985 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
12986 << "','"
12987 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
12988 << "' allowed.";
12989 err = -EINVAL;
12990 goto reply;
12991 }
12992 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
12993 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12994 mode != pg_pool_t::CACHEMODE_PROXY &&
12995 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
12996
12997 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
12998 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12999 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13000
13001 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13002 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13003 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13004
13005 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13006 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13007 mode != pg_pool_t::CACHEMODE_PROXY &&
13008 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13009
13010 const pool_stat_t* pstats =
13011 mon->mgrstatmon()->get_pool_stat(pool_id);
13012
13013 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13014 ss << "unable to set cache-mode '"
13015 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13016 << "': dirty objects found";
13017 err = -EBUSY;
13018 goto reply;
13019 }
13020 }
13021 // go
13022 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13023 np->cache_mode = mode;
13024 // set this both when moving to and from cache_mode NONE. this is to
13025 // capture legacy pools that were set up before this flag existed.
13026 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13027 ss << "set cache-mode for pool '" << poolstr
13028 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13029 if (mode == pg_pool_t::CACHEMODE_NONE) {
13030 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13031 ceph_assert(base_pool);
13032 if (base_pool->read_tier == pool_id ||
13033 base_pool->write_tier == pool_id)
13034 ss <<" (WARNING: pool is still configured as read or write tier)";
13035 }
13036 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13037 get_last_committed() + 1));
13038 return true;
13039 } else if (prefix == "osd tier add-cache") {
13040 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13041 if (err == -EAGAIN)
13042 goto wait;
13043 if (err)
13044 goto reply;
13045 string poolstr;
13046 cmd_getval(cmdmap, "pool", poolstr);
13047 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13048 if (pool_id < 0) {
13049 ss << "unrecognized pool '" << poolstr << "'";
13050 err = -ENOENT;
13051 goto reply;
13052 }
13053 string tierpoolstr;
13054 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13055 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13056 if (tierpool_id < 0) {
13057 ss << "unrecognized pool '" << tierpoolstr << "'";
13058 err = -ENOENT;
13059 goto reply;
13060 }
13061 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13062 ceph_assert(p);
13063 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13064 ceph_assert(tp);
13065
13066 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13067 goto reply;
13068 }
13069
13070 int64_t size = 0;
13071 if (!cmd_getval(cmdmap, "size", size)) {
13072 ss << "unable to parse 'size' value '"
13073 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13074 err = -EINVAL;
13075 goto reply;
13076 }
13077 // make sure new tier is empty
13078 const pool_stat_t *pstats =
13079 mon->mgrstatmon()->get_pool_stat(tierpool_id);
13080 if (pstats && pstats->stats.sum.num_objects != 0) {
13081 ss << "tier pool '" << tierpoolstr << "' is not empty";
13082 err = -ENOTEMPTY;
13083 goto reply;
13084 }
13085 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13086 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13087 if (int(mode) < 0) {
13088 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13089 err = -EINVAL;
13090 goto reply;
13091 }
13092 HitSet::Params hsp;
13093 auto& cache_hit_set_type =
13094 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13095 if (cache_hit_set_type == "bloom") {
13096 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13097 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13098 hsp = HitSet::Params(bsp);
13099 } else if (cache_hit_set_type == "explicit_hash") {
13100 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13101 } else if (cache_hit_set_type == "explicit_object") {
13102 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13103 } else {
13104 ss << "osd tier cache default hit set type '"
13105 << cache_hit_set_type << "' is not a known type";
13106 err = -EINVAL;
13107 goto reply;
13108 }
13109 // go
13110 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13111 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13112 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13113 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13114 return true;
13115 }
13116 np->tiers.insert(tierpool_id);
13117 np->read_tier = np->write_tier = tierpool_id;
13118 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13119 np->set_last_force_op_resend(pending_inc.epoch);
13120 ntp->set_last_force_op_resend(pending_inc.epoch);
13121 ntp->tier_of = pool_id;
13122 ntp->cache_mode = mode;
13123 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13124 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13125 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13126 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13127 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13128 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13129 ntp->hit_set_params = hsp;
13130 ntp->target_max_bytes = size;
13131 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13132 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13133 get_last_committed() + 1));
13134 return true;
13135 } else if (prefix == "osd pool set-quota") {
13136 string poolstr;
13137 cmd_getval(cmdmap, "pool", poolstr);
13138 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13139 if (pool_id < 0) {
13140 ss << "unrecognized pool '" << poolstr << "'";
13141 err = -ENOENT;
13142 goto reply;
13143 }
13144
13145 string field;
13146 cmd_getval(cmdmap, "field", field);
13147 if (field != "max_objects" && field != "max_bytes") {
13148 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13149 err = -EINVAL;
13150 goto reply;
13151 }
13152
13153 // val could contain unit designations, so we treat as a string
13154 string val;
13155 cmd_getval(cmdmap, "val", val);
13156 string tss;
13157 int64_t value;
13158 if (field == "max_objects") {
13159 value = strict_sistrtoll(val.c_str(), &tss);
13160 } else if (field == "max_bytes") {
13161 value = strict_iecstrtoll(val.c_str(), &tss);
13162 } else {
13163 ceph_abort_msg("unrecognized option");
13164 }
13165 if (!tss.empty()) {
13166 ss << "error parsing value '" << val << "': " << tss;
13167 err = -EINVAL;
13168 goto reply;
13169 }
13170
13171 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13172 if (field == "max_objects") {
13173 pi->quota_max_objects = value;
13174 } else if (field == "max_bytes") {
13175 pi->quota_max_bytes = value;
13176 } else {
13177 ceph_abort_msg("unrecognized option");
13178 }
13179 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13180 rs = ss.str();
13181 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13182 get_last_committed() + 1));
13183 return true;
13184 } else if (prefix == "osd pool application enable" ||
13185 prefix == "osd pool application disable" ||
13186 prefix == "osd pool application set" ||
13187 prefix == "osd pool application rm") {
13188 err = prepare_command_pool_application(prefix, cmdmap, ss);
13189 if (err == -EAGAIN) {
13190 goto wait;
13191 } else if (err < 0) {
13192 goto reply;
13193 } else {
13194 goto update;
13195 }
13196 } else if (prefix == "osd force-create-pg") {
13197 pg_t pgid;
13198 string pgidstr;
13199 cmd_getval(cmdmap, "pgid", pgidstr);
13200 if (!pgid.parse(pgidstr.c_str())) {
13201 ss << "invalid pgid '" << pgidstr << "'";
13202 err = -EINVAL;
13203 goto reply;
13204 }
13205 if (!osdmap.pg_exists(pgid)) {
13206 ss << "pg " << pgid << " should not exist";
13207 err = -ENOENT;
13208 goto reply;
13209 }
13210 bool sure = false;
13211 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13212 if (!sure) {
13213 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13214 << "that the cluster will give up ever trying to recover the lost data. Do this "
13215 << "only if you are certain that all copies of the PG are in fact lost and you are "
13216 << "willing to accept that the data is permanently destroyed. Pass "
13217 << "--yes-i-really-mean-it to proceed.";
13218 err = -EPERM;
13219 goto reply;
13220 }
13221 bool creating_now;
13222 {
13223 std::lock_guard<std::mutex> l(creating_pgs_lock);
13224 auto emplaced = creating_pgs.pgs.emplace(
13225 pgid,
13226 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13227 ceph_clock_now()));
13228 creating_now = emplaced.second;
13229 }
13230 if (creating_now) {
13231 ss << "pg " << pgidstr << " now creating, ok";
13232 // set the pool's CREATING flag so that (1) the osd won't ignore our
13233 // create message and (2) we won't propose any future pg_num changes
13234 // until after the PG has been instantiated.
13235 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13236 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13237 }
13238 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13239 err = 0;
13240 goto update;
13241 } else {
13242 ss << "pg " << pgid << " already creating";
13243 err = 0;
13244 goto reply;
13245 }
13246 } else {
13247 err = -EINVAL;
13248 }
13249
13250 reply:
13251 getline(ss, rs);
13252 if (err < 0 && rs.length() == 0)
13253 rs = cpp_strerror(err);
13254 mon->reply_command(op, err, rs, rdata, get_last_committed());
13255 return ret;
13256
13257 update:
13258 getline(ss, rs);
13259 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13260 get_last_committed() + 1));
13261 return true;
13262
13263 wait:
13264 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13265 return true;
13266 }
13267
13268 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13269 {
13270 op->mark_osdmon_event(__func__);
13271
13272 auto m = op->get_req<MPoolOp>();
13273 MonSession *session = op->get_session();
13274 if (!session) {
13275 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13276 return true;
13277 }
13278
13279 switch (m->op) {
13280 case POOL_OP_CREATE_UNMANAGED_SNAP:
13281 case POOL_OP_DELETE_UNMANAGED_SNAP:
13282 {
13283 const std::string* pool_name = nullptr;
13284 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13285 if (pg_pool != nullptr) {
13286 pool_name = &osdmap.get_pool_name(m->pool);
13287 }
13288
13289 if (!is_unmanaged_snap_op_permitted(cct, mon->key_server,
13290 session->entity_name, session->caps,
13291 session->get_peer_socket_addr(),
13292 pool_name)) {
13293 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13294 << "privileges. message: " << *m << std::endl
13295 << "caps: " << session->caps << dendl;
13296 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13297 return true;
13298 }
13299 }
13300 break;
13301 default:
13302 if (!session->is_capable("osd", MON_CAP_W)) {
13303 dout(0) << "got pool op from entity with insufficient privileges. "
13304 << "message: " << *m << std::endl
13305 << "caps: " << session->caps << dendl;
13306 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13307 return true;
13308 }
13309 break;
13310 }
13311
13312 return false;
13313 }
13314
13315 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
13316 {
13317 op->mark_osdmon_event(__func__);
13318 auto m = op->get_req<MPoolOp>();
13319
13320 if (enforce_pool_op_caps(op)) {
13321 return true;
13322 }
13323
13324 if (m->fsid != mon->monmap->fsid) {
13325 dout(0) << __func__ << " drop message on fsid " << m->fsid
13326 << " != " << mon->monmap->fsid << " for " << *m << dendl;
13327 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13328 return true;
13329 }
13330
13331 if (m->op == POOL_OP_CREATE)
13332 return preprocess_pool_op_create(op);
13333
13334 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
13335 if (p == nullptr) {
13336 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
13337 if (m->op == POOL_OP_DELETE) {
13338 _pool_op_reply(op, 0, osdmap.get_epoch());
13339 } else {
13340 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
13341 }
13342 return true;
13343 }
13344
13345 // check if the snap and snapname exist
13346 bool snap_exists = false;
13347 if (p->snap_exists(m->name.c_str()))
13348 snap_exists = true;
13349
13350 switch (m->op) {
13351 case POOL_OP_CREATE_SNAP:
13352 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
13353 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13354 return true;
13355 }
13356 if (snap_exists) {
13357 _pool_op_reply(op, 0, osdmap.get_epoch());
13358 return true;
13359 }
13360 return false;
13361 case POOL_OP_CREATE_UNMANAGED_SNAP:
13362 if (p->is_pool_snaps_mode()) {
13363 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13364 return true;
13365 }
13366 return false;
13367 case POOL_OP_DELETE_SNAP:
13368 if (p->is_unmanaged_snaps_mode()) {
13369 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13370 return true;
13371 }
13372 if (!snap_exists) {
13373 _pool_op_reply(op, 0, osdmap.get_epoch());
13374 return true;
13375 }
13376 return false;
13377 case POOL_OP_DELETE_UNMANAGED_SNAP:
13378 if (p->is_pool_snaps_mode()) {
13379 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
13380 return true;
13381 }
13382 if (_is_removed_snap(m->pool, m->snapid)) {
13383 _pool_op_reply(op, 0, osdmap.get_epoch());
13384 return true;
13385 }
13386 return false;
13387 case POOL_OP_DELETE:
13388 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
13389 _pool_op_reply(op, 0, osdmap.get_epoch());
13390 return true;
13391 }
13392 return false;
13393 case POOL_OP_AUID_CHANGE:
13394 return false;
13395 default:
13396 ceph_abort();
13397 break;
13398 }
13399
13400 return false;
13401 }
13402
13403 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13404 {
13405 if (!osdmap.have_pg_pool(pool)) {
13406 dout(10) << __func__ << " pool " << pool << " snap " << snap
13407 << " - pool dne" << dendl;
13408 return true;
13409 }
13410 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13411 dout(10) << __func__ << " pool " << pool << " snap " << snap
13412 << " - in osdmap removed_snaps_queue" << dendl;
13413 return true;
13414 }
13415 snapid_t begin, end;
13416 int r = lookup_purged_snap(pool, snap, &begin, &end);
13417 if (r == 0) {
13418 dout(10) << __func__ << " pool " << pool << " snap " << snap
13419 << " - purged, [" << begin << "," << end << ")" << dendl;
13420 return true;
13421 }
13422 return false;
13423 }
13424
13425 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13426 {
13427 if (pending_inc.old_pools.count(pool)) {
13428 dout(10) << __func__ << " pool " << pool << " snap " << snap
13429 << " - pool pending deletion" << dendl;
13430 return true;
13431 }
13432 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13433 dout(10) << __func__ << " pool " << pool << " snap " << snap
13434 << " - in pending new_removed_snaps" << dendl;
13435 return true;
13436 }
13437 return false;
13438 }
13439
13440 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13441 {
13442 op->mark_osdmon_event(__func__);
13443 auto m = op->get_req<MPoolOp>();
13444 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13445 if (pool >= 0) {
13446 _pool_op_reply(op, 0, osdmap.get_epoch());
13447 return true;
13448 }
13449
13450 return false;
13451 }
13452
/**
 * Prepare a pool op that mutates the osdmap (snap create/delete, etc.).
 *
 * POOL_OP_CREATE / POOL_OP_DELETE are delegated to dedicated helpers.
 * For the snap ops we first validate against the committed pool, then
 * apply the change to a projected copy of the pool and stage it in
 * pending_inc.
 *
 * @return true when a proposal is pending (the reply is deferred to the
 *         commit callback), false when a reply has already been sent.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  // committed view of the pool; used for the preliminary checks below
  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps are not allowed on a cache tier
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    // shared idempotency check for pool-snap create/delete: creating an
    // existing snap or deleting a missing one replies success with no change
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	// a real change is needed; handled by the switch further below
	break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending version of the pool if one
  // exists in this incremental, else from the committed pool
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // apply the op to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // the newly allocated snapid is returned to the client in reply_data
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq presumably was never
      // issued; reject rather than record a bogus removal
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  // stage the modified pool in the pending incremental
  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
13607
13608 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
13609 {
13610 op->mark_osdmon_event(__func__);
13611 int err = prepare_new_pool(op);
13612 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
13613 return true;
13614 }
13615
13616 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
13617 ostream *ss)
13618 {
13619 const string& poolstr = osdmap.get_pool_name(pool_id);
13620
13621 // If the Pool is in use by CephFS, refuse to delete it
13622 FSMap const &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13623 if (pending_fsmap.pool_in_use(pool_id)) {
13624 *ss << "pool '" << poolstr << "' is in use by CephFS";
13625 return -EBUSY;
13626 }
13627
13628 if (pool.tier_of >= 0) {
13629 *ss << "pool '" << poolstr << "' is a tier of '"
13630 << osdmap.get_pool_name(pool.tier_of) << "'";
13631 return -EBUSY;
13632 }
13633 if (!pool.tiers.empty()) {
13634 *ss << "pool '" << poolstr << "' has tiers";
13635 for(auto tier : pool.tiers) {
13636 *ss << " " << osdmap.get_pool_name(tier);
13637 }
13638 return -EBUSY;
13639 }
13640
13641 if (!g_conf()->mon_allow_pool_delete) {
13642 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13643 return -EPERM;
13644 }
13645
13646 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
13647 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
13648 return -EPERM;
13649 }
13650
13651 *ss << "pool '" << poolstr << "' removed";
13652 return 0;
13653 }
13654
13655 /**
13656 * Check if it is safe to add a tier to a base pool
13657 *
13658 * @return
13659 * True if the operation should proceed, false if we should abort here
13660 * (abort doesn't necessarily mean error, could be idempotency)
13661 */
13662 bool OSDMonitor::_check_become_tier(
13663 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
13664 const int64_t base_pool_id, const pg_pool_t *base_pool,
13665 int *err,
13666 ostream *ss) const
13667 {
13668 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
13669 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13670
13671 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13672 if (pending_fsmap.pool_in_use(tier_pool_id)) {
13673 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
13674 *err = -EBUSY;
13675 return false;
13676 }
13677
13678 if (base_pool->tiers.count(tier_pool_id)) {
13679 ceph_assert(tier_pool->tier_of == base_pool_id);
13680 *err = 0;
13681 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
13682 << base_pool_name << "'";
13683 return false;
13684 }
13685
13686 if (base_pool->is_tier()) {
13687 *ss << "pool '" << base_pool_name << "' is already a tier of '"
13688 << osdmap.get_pool_name(base_pool->tier_of) << "', "
13689 << "multiple tiers are not yet supported.";
13690 *err = -EINVAL;
13691 return false;
13692 }
13693
13694 if (tier_pool->has_tiers()) {
13695 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
13696 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
13697 it != tier_pool->tiers.end(); ++it)
13698 *ss << "'" << osdmap.get_pool_name(*it) << "',";
13699 *ss << " multiple tiers are not yet supported.";
13700 *err = -EINVAL;
13701 return false;
13702 }
13703
13704 if (tier_pool->is_tier()) {
13705 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
13706 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
13707 *err = -EINVAL;
13708 return false;
13709 }
13710
13711 *err = 0;
13712 return true;
13713 }
13714
13715
13716 /**
13717 * Check if it is safe to remove a tier from this base pool
13718 *
13719 * @return
13720 * True if the operation should proceed, false if we should abort here
13721 * (abort doesn't necessarily mean error, could be idempotency)
13722 */
13723 bool OSDMonitor::_check_remove_tier(
13724 const int64_t base_pool_id, const pg_pool_t *base_pool,
13725 const pg_pool_t *tier_pool,
13726 int *err, ostream *ss) const
13727 {
13728 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
13729
13730 // Apply CephFS-specific checks
13731 const FSMap &pending_fsmap = mon->mdsmon()->get_pending_fsmap();
13732 if (pending_fsmap.pool_in_use(base_pool_id)) {
13733 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
13734 // If the underlying pool is erasure coded and does not allow EC
13735 // overwrites, we can't permit the removal of the replicated tier that
13736 // CephFS relies on to access it
13737 *ss << "pool '" << base_pool_name <<
13738 "' does not allow EC overwrites and is in use by CephFS"
13739 " via its tier";
13740 *err = -EBUSY;
13741 return false;
13742 }
13743
13744 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
13745 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
13746 "tier is still in use as a writeback cache. Change the cache "
13747 "mode and flush the cache before removing it";
13748 *err = -EBUSY;
13749 return false;
13750 }
13751 }
13752
13753 *err = 0;
13754 return true;
13755 }
13756
/**
 * Queue removal of a pool in the pending incremental map.
 *
 * Validates the removal against both the committed and the pending pool
 * state, then records the deletion (or a fake rename, if configured) in
 * pending_inc along with cleanup of every per-pool mapping that would
 * otherwise go stale.
 *
 * @param pool     id of the pool to remove
 * @param ss       filled with a human-readable success/error message
 * @param no_fake  when true, bypass mon_fake_pool_delete and really remove
 * @return 0 on success (including already-pending removal), -EAGAIN if the
 *         pending pool info fails the checks (caller should retry after the
 *         current proposal commits), or a negative errno from
 *         _check_remove_pool().
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  // NOTE(review): get_pg_pool() is dereferenced unchecked below; callers
  // presumably validate the pool id exists first — confirm against callers.
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // Check against the committed pool state first.
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // Idempotency: removal already queued in this proposal.
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // With mon_fake_pool_delete set (and no_fake unset), rename the pool to a
  // ".DELETED" name instead of actually destroying any data.
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      // An empty pg_temp entry in the incremental clears the mapping.
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      // -1 in the incremental clears the primary_temp mapping.
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap "
	       << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap.erase(it);
      } else {
	it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete pg_upmap_items " << p.first
	       << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
	dout(10) << __func__ << " " << pool
		 << " removing pending pg_upmap_items "
		 << it->first << dendl;
	it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
	it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush;
  _get_pending_crush(newcrush);
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // Re-encode the whole crush map into the incremental with the
    // choose_args for this pool stripped.
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon->get_quorum_con_features());
  }
  return 0;
}
13872
13873 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
13874 {
13875 dout(10) << "_prepare_rename_pool " << pool << dendl;
13876 if (pending_inc.old_pools.count(pool)) {
13877 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
13878 return -ENOENT;
13879 }
13880 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
13881 p != pending_inc.new_pool_names.end();
13882 ++p) {
13883 if (p->second == newname && p->first != pool) {
13884 return -EEXIST;
13885 }
13886 }
13887
13888 pending_inc.new_pool_names[pool] = newname;
13889 return 0;
13890 }
13891
13892 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
13893 {
13894 op->mark_osdmon_event(__func__);
13895 auto m = op->get_req<MPoolOp>();
13896 ostringstream ss;
13897 int ret = _prepare_remove_pool(m->pool, &ss, false);
13898 if (ret == -EAGAIN) {
13899 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13900 return true;
13901 }
13902 if (ret < 0)
13903 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
13904 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
13905 pending_inc.epoch));
13906 return true;
13907 }
13908
13909 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
13910 int ret, epoch_t epoch, bufferlist *blp)
13911 {
13912 op->mark_osdmon_event(__func__);
13913 auto m = op->get_req<MPoolOp>();
13914 dout(20) << "_pool_op_reply " << ret << dendl;
13915 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
13916 ret, epoch, get_last_committed(), blp);
13917 mon->send_reply(op, reply);
13918 }
13919
13920 void OSDMonitor::convert_pool_priorities(void)
13921 {
13922 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
13923 int64_t max_prio = 0;
13924 int64_t min_prio = 0;
13925 for (const auto &i : osdmap.get_pools()) {
13926 const auto &pool = i.second;
13927
13928 if (pool.opts.is_set(key)) {
13929 int64_t prio = 0;
13930 pool.opts.get(key, &prio);
13931 if (prio > max_prio)
13932 max_prio = prio;
13933 if (prio < min_prio)
13934 min_prio = prio;
13935 }
13936 }
13937 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
13938 dout(20) << __func__ << " nothing to fix" << dendl;
13939 return;
13940 }
13941 // Current pool priorities exceeds new maximum
13942 for (const auto &i : osdmap.get_pools()) {
13943 const auto pool_id = i.first;
13944 pg_pool_t pool = i.second;
13945
13946 int64_t prio = 0;
13947 pool.opts.get(key, &prio);
13948 int64_t n;
13949
13950 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
13951 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
13952 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
13953 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
13954 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
13955 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
13956 } else {
13957 continue;
13958 }
13959 if (n == 0) {
13960 pool.opts.unset(key);
13961 } else {
13962 pool.opts.set(key, static_cast<int64_t>(n));
13963 }
13964 dout(10) << __func__ << " pool " << pool_id
13965 << " recovery_priority adjusted "
13966 << prio << " to " << n << dendl;
13967 pool.last_change = pending_inc.epoch;
13968 pending_inc.new_pools[pool_id] = pool;
13969 }
13970 }