]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
3acafbb82e682b812e468749815ba786ad9ea2e4
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MRoute.h"
57 #include "messages/MMonGetPurgedSnaps.h"
58 #include "messages/MMonGetPurgedSnapsReply.h"
59
60 #include "common/TextTable.h"
61 #include "common/Timer.h"
62 #include "common/ceph_argparse.h"
63 #include "common/perf_counters.h"
64 #include "common/PriorityCache.h"
65 #include "common/strtol.h"
66 #include "common/numa.h"
67
68 #include "common/config.h"
69 #include "common/errno.h"
70
71 #include "erasure-code/ErasureCodePlugin.h"
72 #include "compressor/Compressor.h"
73 #include "common/Checksummer.h"
74
75 #include "include/compat.h"
76 #include "include/ceph_assert.h"
77 #include "include/stringify.h"
78 #include "include/util.h"
79 #include "common/cmdparse.h"
80 #include "include/str_list.h"
81 #include "include/str_map.h"
82 #include "include/scope_guard.h"
83 #include "perfglue/heap_profiler.h"
84
85 #include "auth/cephx/CephxKeyServer.h"
86 #include "osd/OSDCap.h"
87
88 #include "json_spirit/json_spirit_reader.h"
89
90 #include <boost/algorithm/string/predicate.hpp>
91
92 using std::dec;
93 using std::hex;
94 using std::list;
95 using std::map;
96 using std::make_pair;
97 using std::ostringstream;
98 using std::pair;
99 using std::set;
100 using std::string;
101 using std::stringstream;
102 using std::to_string;
103 using std::vector;
104
105 using ceph::bufferlist;
106 using ceph::decode;
107 using ceph::encode;
108 using ceph::ErasureCodeInterfaceRef;
109 using ceph::ErasureCodePluginRegistry;
110 using ceph::ErasureCodeProfile;
111 using ceph::Formatter;
112 using ceph::JSONFormatter;
113 using ceph::make_message;
114
#define dout_subsys ceph_subsys_mon

// MonitorDBStore key prefixes used by this service (see get_store_prefixes()):
// pg-creation state, per-OSD metadata blobs, and snapshot bookkeeping.
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
119
120 /*
121
122 OSD snapshot metadata
123 ---------------------
124
125 -- starting with mimic, removed in octopus --
126
127 "removed_epoch_%llu_%08lx" % (pool, epoch)
128 -> interval_set<snapid_t>
129
130 "removed_snap_%llu_%016llx" % (pool, last_snap)
131 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
132
133
134 -- starting with mimic --
135
136 "purged_snap_%llu_%016llx" % (pool, last_snap)
137 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
138
139 - note that the {removed,purged}_snap put the last snap in they key so
140 that we can use forward iteration only to search for an epoch in an
141 interval. e.g., to test if epoch N is removed/purged, we'll find a key
142 >= N that either does or doesn't contain the given snap.
143
144
145 -- starting with octopus --
146
147 "purged_epoch_%08lx" % epoch
148 -> map<int64_t,interval_set<snapid_t>>
149
150 */
151 using namespace TOPNSPC::common;
152 namespace {
153
// Base adapter exposing an OSDMonitor-owned LRU cache to the PriorityCache
// manager.  It tracks the bytes assigned and committed to this cache per
// priority level; subclasses report their actual usage via _get_used_bytes()
// and a display name via get_cache_name().
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;  // owning monitor service; outlives this shim
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};  // assigned per priority
  int64_t committed_bytes = 0;  // chunk-aligned total granted by the manager
  double cache_ratio = 0;       // our share of the total cache budget

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes actually in use by the underlying LRU (subclass-specific).
  virtual uint64_t _get_used_bytes() const = 0;

  // Request enough additional bytes at priority `pri` to cover current
  // usage.  Only PRI1 is handled; other priorities yield -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // ask only for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assigned bytes across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the current assignment to the manager's chunk size and record it.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Age-bin bookkeeping is not used by the osdmon caches; no-op stubs.
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  // Human-readable name used in cache-manager logging (subclass-specific).
  virtual string get_cache_name() const = 0;
};
227
228 struct IncCache : public OSDMemCache {
229 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
230
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon->inc_osd_cache.get_bytes();
233 }
234
235 virtual string get_cache_name() const {
236 return "OSDMap Inc Cache";
237 }
238
239 uint64_t _get_num_osdmaps() const {
240 return osdmon->inc_osd_cache.get_size();
241 }
242 };
243
244 struct FullCache : public OSDMemCache {
245 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
246
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon->full_osd_cache.get_bytes();
249 }
250
251 virtual string get_cache_name() const {
252 return "OSDMap Full Cache";
253 }
254
255 uint64_t _get_num_osdmaps() const {
256 return osdmon->full_osd_cache.get_size();
257 }
258 };
259
// Shared handles to the priority-cache shims; created by the OSDMonitor
// constructor and registered with the PriorityCache manager when autotuning
// is enabled (see register_cache_with_pcm()).
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits on per-pool application metadata (count of applications, keys per
// application, and key/value length) — presumably enforced by the pool
// application command handlers; confirm at call sites.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
266
267 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant.spec.allow & OSD_CAP_W) != 0) {
270 auto& match = grant.match;
271 if (match.is_match_all()) {
272 return true;
273 } else if (pool_name != nullptr &&
274 !match.pool_namespace.pool_name.empty() &&
275 match.pool_namespace.pool_name == *pool_name) {
276 return true;
277 }
278 }
279 return false;
280 }
281
// Decide whether a client (entity_name) may issue unmanaged-snapshot pool
// ops.  Permission is granted if either (a) its mon caps allow the
// "osd pool op unmanaged-snap" command for this pool, or (b) its OSD caps
// from the auth db grant write access to the pool (or to all pools).  When
// pool_name is nullptr the pool no longer exists, so an unrestricted mon
// cap is required.  Returns false on any auth-db lookup/parse failure.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // first: explicit mon-cap command grant
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
         CommandArgs{} /* pool DNE, require unrestricted cap */ :
         CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // otherwise: fall back to the entity's OSD caps from the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into concrete grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
349
350 } // anonymous namespace
351
// Record that PG `ps` of this pool reported `last_epoch_clean`, keeping two
// invariants up to date: `floor` (minimum lec over all reporting PGs) and
// `next_missing` (first PG that has never reported, i.e. whose entry is 0).
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
                                 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  // grow (or shrink) the per-PG table to the pool's current pg_num;
  // new entries start at 0 ("never reported")
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the minimum; rescan for the new minimum
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this was the first unreported PG; advance past any contiguous run of
  // PGs that have already reported
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
385
386 void LastEpochClean::remove_pool(uint64_t pool)
387 {
388 report_by_pool.erase(pool);
389 }
390
391 void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
392 epoch_t last_epoch_clean)
393 {
394 auto& lec = report_by_pool[pg.pool()];
395 return lec.report(pg_num, pg.ps(), last_epoch_clean);
396 }
397
398 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
399 {
400 auto floor = latest.get_epoch();
401 for (auto& pool : latest.get_pools()) {
402 auto reported = report_by_pool.find(pool.first);
403 if (reported == report_by_pool.end()) {
404 return 0;
405 }
406 if (reported->second.next_missing < pool.second.get_pg_num()) {
407 return 0;
408 }
409 if (reported->second.floor < floor) {
410 floor = reported->second.floor;
411 }
412 }
413 return floor;
414 }
415
416 void LastEpochClean::dump(Formatter *f) const
417 {
418 f->open_array_section("per_pool");
419
420 for (auto& [pool, lec] : report_by_pool) {
421 f->open_object_section("pool");
422 f->dump_unsigned("poolid", pool);
423 f->dump_unsigned("floor", lec.floor);
424 f->close_section();
425 }
426
427 f->close_section();
428 }
429
// Completion for an asynchronous osdmap mapping job: once the mapping for
// `epoch` has been computed, refresh the creating-PGs state and notify
// pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the mapping job was kicked off (for log timing)
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted; skip the update in that case
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
447
448 #undef dout_prefix
449 #define dout_prefix _prefix(_dout, mon, osdmap)
450 static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
451 return *_dout << "mon." << mon.name << "@" << mon.rank
452 << "(" << mon.get_state_name()
453 << ").osd e" << osdmap.get_epoch() << " ";
454 }
455
// Construct the OSD monitor service: size the inc/full osdmap LRU caches
// from mon_osd_cache_size, create the priority-cache shims, register as a
// config observer, and read the initial cache-tuning settings.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // invalid autotune sizes: keep the fixed-size LRU caches instead
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
479
480 const char **OSDMonitor::get_tracked_conf_keys() const
481 {
482 static const char* KEYS[] = {
483 "mon_memory_target",
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
486 NULL
487 };
488 return KEYS;
489 }
490
// Config observer callback: react to runtime changes of the memory-tuning
// options listed in get_tracked_conf_keys().
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      // sizes rejected; previous cache settings remain in effect
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
512
// Bring priority-cache autotuning in line with the current value of
// mon_memory_autotune: disabling drops the manager; enabling (re-)registers
// the osdmon caches with a fresh manager.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
534
// Apply new mon_memory_target / rocksdb_cache_size values at runtime:
// recompute the kv/inc/full cache ratios and, when autotuning is active,
// push the new min/max/target levels into the priority-cache manager.
// Returns 0 on success, -EINVAL on invalid sizes (previous values restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation when computing the cap
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
            << " target: " << target
            << " min: " << min
            << " max: " << max
            << dendl;
  }
  return 0;
}
594
// Read the initial cache-sizing config.  When autotuning is enabled,
// validate mon_memory_target/min and seed the inc/full LRUs with the
// minimum byte budget (pcm registration happens later, on first use).
// Returns 0 on success or when autotune is off, -EINVAL on bad sizes.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
619
620 bool OSDMonitor::_have_pending_crush()
621 {
622 return pending_inc.crush.length() > 0;
623 }
624
// Direct reference to the committed osdmap's crush map (no pending
// changes applied).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
629
630 CrushWrapper OSDMonitor::_get_pending_crush()
631 {
632 bufferlist bl;
633 if (pending_inc.crush.length())
634 bl = pending_inc.crush;
635 else
636 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
637
638 auto p = bl.cbegin();
639 CrushWrapper crush;
640 crush.decode(p);
641 return crush;
642 }
643
// Build osdmap epoch 1 for a brand-new cluster: start from an mkfs-provided
// seed map if one was stashed, set default flags and full/nearfull ratios,
// pick the required OSD release and min-compat client, and stage the encoded
// map in pending_inc for the first paxos commit.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // mkfs supplied a seed osdmap; adopt it but force our fsid
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_reef")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
      derr << __func__ << " mon_debug_no_require_reef and quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    } else {
      derr << __func__ << " mon_debug_no_require_reef=true" << dendl;
      newmap.require_osd_release = ceph_release_t::quincy;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::reef;
  }

  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
702
703 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
704 {
705 s.insert(service_name);
706 s.insert(OSD_PG_CREATING_PREFIX);
707 s.insert(OSD_METADATA_PREFIX);
708 s.insert(OSD_SNAP_PREFIX);
709 }
710
// Bring the in-memory osdmap up to date with the paxos-committed state:
// locate/load the newest stashed full map, then replay the remaining
// incrementals, persisting full maps along the way, and finally react to
// up/down transitions and stretch-mode changes.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // remember the up count so the stretch-mode checks at the bottom can tell
  // whether this update brought OSDs up
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full_<v> key
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump ahead to the newest stashed full map instead of replaying
    // incrementals from our current (older) epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // dump the locally-built (divergent) map for debugging
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        // dump the canonical map we just adopted
        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd);
      }
    }
    for (auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
        // manually marked out, so drop it
        osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile the down->out timeout tracking with the new map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
        marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
        mon.set_degraded_stretch_mode();
        dout(20) << "prev_num_up_osd: " << prev_num_up_osd << dendl;
        dout(20) << "osdmap.num_up_osd: " << osdmap.num_up_osd << dendl;
        dout(20) << "osdmap.num_osd: " << osdmap.num_osd << dendl;
        dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") << dendl;
        // enter recovery only when OSDs came up, enough of them are up,
        // and no mon bucket is dead
        if (prev_num_up_osd < osdmap.num_up_osd &&
            (osdmap.num_up_osd / (double)osdmap.num_osd) >
            cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") &&
            mon.dead_mon_buckets.size() == 0) {
          // TODO: This works for 2-site clusters when the OSD maps are appropriately
          // trimmed and everything is "normal" but not if you have a lot of out OSDs
          // you're ignoring or in some really degenerate failure cases

          dout(10) << "Enabling recovery stretch mode in this map" << dendl;
          mon.go_recovery_stretch_mode();
        }
      } else {
        mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
        (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
988
// Create the PriorityCache manager and register the rocksdb kv cache plus
// the inc/full osdmap caches with it, computing min/max/target memory from
// the configured sizes.  Returns 0 on success, -EINVAL when sizes are bad,
// rocksdb exposes no priority cache, or the ratios cannot be set.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
1038
1039 int OSDMonitor::_set_cache_ratios()
1040 {
1041 double old_cache_kv_ratio = cache_kv_ratio;
1042
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
1045 if (cache_kv_ratio >= 1.0) {
1046 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1048 << dendl;
1049 cache_kv_ratio = old_cache_kv_ratio;
1050 return -EINVAL;
1051 }
1052 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
1053 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
1054 inc_cache->set_cache_ratio(cache_inc_ratio);
1055 full_cache->set_cache_ratio(cache_full_ratio);
1056
1057 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1060 << dendl;
1061 return 0;
1062 }
1063
1064 void OSDMonitor::start_mapping()
1065 {
1066 // initiate mapping job
1067 if (mapping_job) {
1068 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1069 << dendl;
1070 mapping_job->abort();
1071 }
1072 if (!osdmap.get_pools().empty()) {
1073 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1074 mapping_job = mapping.start_update(osdmap, mapper,
1075 g_conf()->mon_osd_mapping_pgs_per_chunk);
1076 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1077 << " at " << fin->start << dendl;
1078 mapping_job->set_finish_event(fin);
1079 } else {
1080 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1081 mapping_job = nullptr;
1082 }
1083 }
1084
1085 void OSDMonitor::update_msgr_features()
1086 {
1087 const int types[] = {
1088 entity_name_t::TYPE_OSD,
1089 entity_name_t::TYPE_CLIENT,
1090 entity_name_t::TYPE_MDS,
1091 entity_name_t::TYPE_MON
1092 };
1093 for (int type : types) {
1094 uint64_t mask;
1095 uint64_t features = osdmap.get_features(type, &mask);
1096 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1097 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1098 ceph::net::Policy p = mon.messenger->get_policy(type);
1099 p.features_required = (p.features_required & ~mask) | features;
1100 mon.messenger->set_policy(type, p);
1101 }
1102 }
1103 }
1104
1105 void OSDMonitor::on_active()
1106 {
1107 update_logger();
1108
1109 if (mon.is_leader()) {
1110 mon.clog->debug() << "osdmap " << osdmap;
1111 if (!priority_convert) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert = true;
1115 }
1116 } else {
1117 list<MonOpRequestRef> ls;
1118 take_all_failures(ls);
1119 while (!ls.empty()) {
1120 MonOpRequestRef op = ls.front();
1121 op->mark_osdmon_event(__func__);
1122 dispatch(op);
1123 ls.pop_front();
1124 }
1125 }
1126 start_mapping();
1127 }
1128
void OSDMonitor::on_restart()
{
  // Forget per-OSD report timestamps; they will be repopulated as fresh
  // reports arrive after the restart.
  last_osd_report.clear();
}
1133
1134 void OSDMonitor::on_shutdown()
1135 {
1136 dout(10) << __func__ << dendl;
1137 if (mapping_job) {
1138 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1139 << dendl;
1140 mapping_job->abort();
1141 }
1142
1143 // discard failure info, waiters
1144 list<MonOpRequestRef> ls;
1145 take_all_failures(ls);
1146 ls.clear();
1147 }
1148
1149 void OSDMonitor::update_logger()
1150 {
1151 dout(10) << "update_logger" << dendl;
1152
1153 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1154 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1155 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1156 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1157 }
1158
1159 void OSDMonitor::create_pending()
1160 {
1161 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1162 pending_inc.fsid = mon.monmap->fsid;
1163 pending_metadata.clear();
1164 pending_metadata_rm.clear();
1165 pending_pseudo_purged_snaps.clear();
1166
1167 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1168
1169 // safety checks (this shouldn't really happen)
1170 {
1171 if (osdmap.backfillfull_ratio <= 0) {
1172 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1173 if (pending_inc.new_backfillfull_ratio > 1.0)
1174 pending_inc.new_backfillfull_ratio /= 100;
1175 dout(1) << __func__ << " setting backfillfull_ratio = "
1176 << pending_inc.new_backfillfull_ratio << dendl;
1177 }
1178 if (osdmap.full_ratio <= 0) {
1179 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1180 if (pending_inc.new_full_ratio > 1.0)
1181 pending_inc.new_full_ratio /= 100;
1182 dout(1) << __func__ << " setting full_ratio = "
1183 << pending_inc.new_full_ratio << dendl;
1184 }
1185 if (osdmap.nearfull_ratio <= 0) {
1186 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1187 if (pending_inc.new_nearfull_ratio > 1.0)
1188 pending_inc.new_nearfull_ratio /= 100;
1189 dout(1) << __func__ << " setting nearfull_ratio = "
1190 << pending_inc.new_nearfull_ratio << dendl;
1191 }
1192 }
1193 }
1194
/**
 * Compute the updated set of creating pgs, given the pending incremental
 * @p inc and the map @p nextmap it will produce.
 *
 * Works on a snapshot of creating_pgs (taken under creating_pgs_lock):
 * scans for pools added/removed by @p inc, drops pgs whose pool was
 * deleted or that have since been reported created, filters out pgs that
 * do not exist under @p nextmap, and then admits queued pgs up to
 * mon_osd_max_creating_pgs.  For octopus+ clusters it also advances each
 * creating pg's history/past_intervals to the new interval.
 *
 * @return the updated creating_pgs_t snapshot (caller persists it).
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // Work on a private copy; creating_pgs is shared with other paths.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    // Scan both the committed pools and the pools this incremental adds.
    unsigned queued = 0;
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // Deleted pools take their creating pgs (and last-epoch-clean state)
    // with them.
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue
  // Admit queued pg ranges until we hit the cap on concurrently-creating
  // pgs (mon_osd_max_creating_pgs, clamped to at least 1).
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // Take as many pgs from this pool's [start,end) range as the cap allows.
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      // Predicate: a pg is recoverable if at least min_size shards exist.
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	// Existing entry: detect an interval change between osdmap and
	// nextmap and, if found, record the new interval in its history.
	std::stringstream debug;
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  // Note a split if the pool's pg_num grows across the two maps.
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1381
/**
 * Pre-populate pending_inc.new_pg_temp for pgs whose mapping will change
 * in the pending epoch.  Either primes every pg ("all") or only the pgs
 * mapped to a small set of "interesting" OSDs, whichever is estimated to
 * be cheaper; both paths are bounded by mon_osd_prime_pg_temp_max_time.
 */
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  // A new crush map or newly-up osds can move anything: consider all pgs.
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // new_state carries xor'd state bits: the UP bit on a currently-up osd
  // means it is being marked down in this epoch (see encode_pending's
  // "DOWN" logging of the same condition).
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // Weight increases can pull pgs toward the osd from anywhere.
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate the per-osd cost from the first osd's acting-pg count; if
    // priming osd-by-osd would touch too large a fraction of all pgs,
    // fall back to priming everything.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // Materialize the would-be next map so mappings can be compared.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // Prime every pg in parallel, capped by the configured wall-clock
    // budget; abort the job if it overruns.
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // Prime only the pgs mapped to the interesting osds, checking the
    // clock every `chunk` pgs to honor the same time budget.
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;  // already primed via another interesting osd
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1484
/**
 * If @p pgid's acting set would change under the pending map @p next,
 * record its current acting set as a pg_temp entry in pending_inc (or an
 * empty entry to clear pg_temp when next_up already equals next_acting).
 * Bails out early whenever priming cannot improve on the status quo.
 */
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;  // still being created; nothing worth preserving
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // Mapping under the committed map...
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // ...and under the pending (next) map.
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // An empty pg_temp entry clears any existing pg_temp mapping.
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // prime_pg_temp_lock serializes concurrent callers (the parallel
    // PrimeTempJob path) mutating pending_inc.new_pg_temp.
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1532
1533 /**
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1536 */
1537 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1538 {
1539 dout(10) << "encode_pending e " << pending_inc.epoch
1540 << dendl;
1541
1542 if (do_prune(t)) {
1543 dout(1) << __func__ << " osdmap full prune encoded e"
1544 << pending_inc.epoch << dendl;
1545 }
1546
1547 // finalize up pending_inc
1548 pending_inc.modified = ceph_clock_now();
1549
1550 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1551 ceph_assert(r == 0);
1552
1553 if (mapping_job) {
1554 if (!mapping_job->is_done()) {
1555 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1556 << mapping_job.get() << " did not complete, "
1557 << mapping_job->shards << " left" << dendl;
1558 mapping_job->abort();
1559 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1560 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1561 << mapping_job.get() << " is prior epoch "
1562 << mapping.get_epoch() << dendl;
1563 } else {
1564 if (g_conf()->mon_osd_prime_pg_temp) {
1565 maybe_prime_pg_temp();
1566 }
1567 }
1568 } else if (g_conf()->mon_osd_prime_pg_temp) {
1569 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1570 << dendl;
1571 }
1572 mapping_job.reset();
1573
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p = pending_inc.new_state.begin();
1577 while (p != pending_inc.new_state.end()) {
1578 if (p->second == 0) {
1579 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1580 p = pending_inc.new_state.erase(p);
1581 } else {
1582 if (p->second & CEPH_OSD_UP) {
1583 pending_inc.new_last_up_change = pending_inc.modified;
1584 }
1585 ++p;
1586 }
1587 }
1588 if (!pending_inc.new_up_client.empty()) {
1589 pending_inc.new_last_up_change = pending_inc.modified;
1590 }
1591 for (auto& i : pending_inc.new_weight) {
1592 if (i.first >= osdmap.max_osd) {
1593 if (i.second) {
1594 // new osd is already marked in
1595 pending_inc.new_last_in_change = pending_inc.modified;
1596 break;
1597 }
1598 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1599 // existing osd marked in or out
1600 pending_inc.new_last_in_change = pending_inc.modified;
1601 break;
1602 }
1603 }
1604
1605 {
1606 OSDMap tmp;
1607 tmp.deepish_copy_from(osdmap);
1608 tmp.apply_incremental(pending_inc);
1609
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1612
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1614 {
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector<pg_t> pgs_to_check;
1619 tmp.get_upmap_pgs(&pgs_to_check);
1620 if (pgs_to_check.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1622 // not enough pgs, do it inline
1623 tmp.clean_pg_upmaps(cct, &pending_inc);
1624 } else {
1625 CleanUpmapJob job(cct, tmp, pending_inc);
1626 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1627 job.wait();
1628 }
1629 }
1630
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1634 bufferlist creatings_bl;
1635 uint64_t features = CEPH_FEATURES_ALL;
1636 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1637 dout(20) << __func__ << " encoding pending pgs without octopus features"
1638 << dendl;
1639 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1640 }
1641 encode(pending_creatings, creatings_bl, features);
1642 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1643
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i : tmp.get_pools()) {
1646 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc.new_pools.count(i.first)) {
1649 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1650 }
1651 }
1652 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1653 !pending_creatings.still_creating_pool(i.first)) {
1654 dout(10) << __func__ << " done creating pool " << i.first
1655 << ", clearing CREATING flag" << dendl;
1656 if (pending_inc.new_pools.count(i.first) == 0) {
1657 pending_inc.new_pools[i.first] = i.second;
1658 }
1659 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1660 }
1661 }
1662
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set<int64_t> full_pool_ids;
1667 set<int64_t> backfillfull_pool_ids;
1668 set<int64_t> nearfull_pool_ids;
1669 tmp.get_full_pools(cct,
1670 &full_pool_ids,
1671 &backfillfull_pool_ids,
1672 &nearfull_pool_ids);
1673 if (full_pool_ids.empty() ||
1674 backfillfull_pool_ids.empty() ||
1675 nearfull_pool_ids.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
1677 // try cancel any improper nearfull/backfillfull/full pool
1678 // flags first
1679 for (auto &pool: tmp.get_pools()) {
1680 auto p = pool.first;
1681 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1682 nearfull_pool_ids.empty()) {
1683 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1684 << "'s nearfull flag" << dendl;
1685 if (pending_inc.new_pools.count(p) == 0) {
1686 // load original pool info first!
1687 pending_inc.new_pools[p] = pool.second;
1688 }
1689 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1690 }
1691 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1692 backfillfull_pool_ids.empty()) {
1693 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1694 << "'s backfillfull flag" << dendl;
1695 if (pending_inc.new_pools.count(p) == 0) {
1696 pending_inc.new_pools[p] = pool.second;
1697 }
1698 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1699 }
1700 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1701 full_pool_ids.empty()) {
1702 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1703 // set by EQUOTA, skipping
1704 continue;
1705 }
1706 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1707 << "'s full flag" << dendl;
1708 if (pending_inc.new_pools.count(p) == 0) {
1709 pending_inc.new_pools[p] = pool.second;
1710 }
1711 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1712 }
1713 }
1714 }
1715 if (!full_pool_ids.empty()) {
1716 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl;
1718 for (auto &p: full_pool_ids) {
1719 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1720 continue;
1721 }
1722 if (pending_inc.new_pools.count(p) == 0) {
1723 pending_inc.new_pools[p] = tmp.pools[p];
1724 }
1725 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1726 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1727 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1728 }
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool: tmp.get_pools()) {
1731 auto p = pool.first;
1732 if (full_pool_ids.count(p)) {
1733 // skip pools we have just marked as full above
1734 continue;
1735 }
1736 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1737 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s full flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1748 }
1749 }
1750 if (!backfillfull_pool_ids.empty()) {
1751 for (auto &p: backfillfull_pool_ids) {
1752 if (full_pool_ids.count(p)) {
1753 // skip pools we have already considered as full above
1754 continue;
1755 }
1756 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1760 continue;
1761 }
1762 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1763 // don't bother if pool is already marked as backfillfull
1764 continue;
1765 }
1766 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1767 << "'s as backfillfull" << dendl;
1768 if (pending_inc.new_pools.count(p) == 0) {
1769 pending_inc.new_pools[p] = tmp.pools[p];
1770 }
1771 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1772 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1773 }
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool: tmp.get_pools()) {
1777 auto p = pool.first;
1778 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as backfillfull/full above
1780 continue;
1781 }
1782 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1783 // and don't touch if currently is not backfillfull
1784 continue;
1785 }
1786 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1787 << "'s backfillfull flag" << dendl;
1788 if (pending_inc.new_pools.count(p) == 0) {
1789 pending_inc.new_pools[p] = pool.second;
1790 }
1791 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1792 }
1793 }
1794 if (!nearfull_pool_ids.empty()) {
1795 for (auto &p: nearfull_pool_ids) {
1796 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1797 continue;
1798 }
1799 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1803 continue;
1804 }
1805 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1806 // don't bother if pool is already marked as nearfull
1807 continue;
1808 }
1809 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1810 << "'s as nearfull" << dendl;
1811 if (pending_inc.new_pools.count(p) == 0) {
1812 pending_inc.new_pools[p] = tmp.pools[p];
1813 }
1814 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1815 }
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool: tmp.get_pools()) {
1819 auto p = pool.first;
1820 if (full_pool_ids.count(p) ||
1821 backfillfull_pool_ids.count(p) ||
1822 nearfull_pool_ids.count(p)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1825 continue;
1826 }
1827 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1828 // and don't touch if currently is not nearfull
1829 continue;
1830 }
1831 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1832 << "'s nearfull flag" << dendl;
1833 if (pending_inc.new_pools.count(p) == 0) {
1834 pending_inc.new_pools[p] = pool.second;
1835 }
1836 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1837 }
1838 }
1839
1840 // min_compat_client?
1841 if (!tmp.require_min_compat_client) {
1842 auto mv = tmp.get_min_compat_client();
1843 dout(1) << __func__ << " setting require_min_compat_client to currently "
1844 << "required " << mv << dendl;
1845 mon.clog->info() << "setting require_min_compat_client to currently "
1846 << "required " << mv;
1847 pending_inc.new_require_min_compat_client = mv;
1848 }
1849
1850 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1851 tmp.require_osd_release >= ceph_release_t::nautilus) {
1852 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1853 // add creating flags?
1854 for (auto& i : tmp.get_pools()) {
1855 if (pending_creatings.still_creating_pool(i.first)) {
1856 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1857 << dendl;
1858 if (pending_inc.new_pools.count(i.first) == 0) {
1859 pending_inc.new_pools[i.first] = i.second;
1860 }
1861 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1862 }
1863 }
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i : tmp.blocklist) {
1866 auto a = i.first;
1867 a.set_type(entity_addr_t::TYPE_ANY);
1868 pending_inc.new_blocklist[a] = i.second;
1869 pending_inc.old_blocklist.push_back(i.first);
1870 }
1871 }
1872
1873 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1874 tmp.require_osd_release >= ceph_release_t::octopus) {
1875 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1876
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid, pi] : tmp.pools) {
1879 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1880 if (pending_inc.new_pools.count(poolid) == 0) {
1881 pending_inc.new_pools[poolid] = pi;
1882 }
1883 dout(10) << __func__ << " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl;
1885 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1886 }
1887 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1888 if (pending_inc.new_pools.count(poolid) == 0) {
1889 pending_inc.new_pools[poolid] = pi;
1890 }
1891 dout(10) << __func__ << " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl;
1893 pending_inc.new_pools[poolid].cache_mode =
1894 pg_pool_t::CACHEMODE_READPROXY;
1895 }
1896 }
1897
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid, pi] : tmp.pools) {
1900 if (pi.removed_snaps.empty()) {
1901 continue;
1902 }
1903 if (pending_inc.new_pools.count(poolid) == 0) {
1904 pending_inc.new_pools[poolid] = pi;
1905 }
1906 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1907 << dendl;
1908 pending_inc.new_pools[poolid].removed_snaps.clear();
1909 }
1910
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1914 // encoding now).
1915 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1916 it->lower_bound("purged_snap_");
1917 map<int64_t,snap_interval_set_t> combined;
1918 while (it->valid()) {
1919 if (it->key().find("purged_snap_") != 0) {
1920 break;
1921 }
1922 string k = it->key();
1923 long long unsigned pool;
1924 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1925 if (n != 1) {
1926 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1927 } else {
1928 bufferlist v = it->value();
1929 auto p = v.cbegin();
1930 snapid_t begin, end;
1931 ceph::decode(begin, p);
1932 ceph::decode(end, p);
1933 combined[pool].insert(begin, end - begin);
1934 }
1935 it->next();
1936 }
1937 if (!combined.empty()) {
1938 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1939 bufferlist v;
1940 ceph::encode(combined, v);
1941 t->put(OSD_SNAP_PREFIX, k, v);
1942 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1944 << dendl;
1945 } else {
1946 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1947 << dendl;
1948 }
1949
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1953 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1954 }
1955 }
1956
1957 // tell me about it
1958 for (auto i = pending_inc.new_state.begin();
1959 i != pending_inc.new_state.end();
1960 ++i) {
1961 int s = i->second ? i->second : CEPH_OSD_UP;
1962 if (s & CEPH_OSD_UP) {
1963 dout(2) << " osd." << i->first << " DOWN" << dendl;
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1966 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1967 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1969 set_default_laggy_params(i->first);
1970 }
1971 }
1972 }
1973 if (s & CEPH_OSD_EXISTS)
1974 dout(2) << " osd." << i->first << " DNE" << dendl;
1975 }
1976 for (auto i = pending_inc.new_up_client.begin();
1977 i != pending_inc.new_up_client.end();
1978 ++i) {
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1981 }
1982 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1983 i != pending_inc.new_weight.end();
1984 ++i) {
1985 if (i->second == CEPH_OSD_OUT) {
1986 dout(2) << " osd." << i->first << " OUT" << dendl;
1987 } else if (i->second == CEPH_OSD_IN) {
1988 dout(2) << " osd." << i->first << " IN" << dendl;
1989 } else {
1990 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1991 }
1992 }
1993
1994 // features for osdmap and its incremental
1995 uint64_t features;
1996
1997 // encode full map and determine its crc
1998 OSDMap tmp;
1999 {
2000 tmp.deepish_copy_from(osdmap);
2001 tmp.apply_incremental(pending_inc);
2002
2003 // determine appropriate features
2004 features = tmp.get_encoding_features();
2005 dout(10) << __func__ << " encoding full map with "
2006 << tmp.require_osd_release
2007 << " features " << features << dendl;
2008
2009 // the features should be a subset of the mon quorum's features!
2010 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2011
2012 bufferlist fullbl;
2013 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2014 pending_inc.full_crc = tmp.get_crc();
2015
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t, pending_inc.epoch, fullbl);
2020 }
2021
2022 // encode
2023 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2024 bufferlist bl;
2025 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2026
2027 dout(20) << " full_crc " << tmp.get_crc()
2028 << " inc_crc " << pending_inc.inc_crc << dendl;
2029
2030 /* put everything in the transaction */
2031 put_version(t, pending_inc.epoch, bl);
2032 put_last_committed(t, pending_inc.epoch);
2033
2034 // metadata, too!
2035 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2036 p != pending_metadata.end();
2037 ++p) {
2038 Metadata m;
2039 auto mp = p->second.cbegin();
2040 decode(m, mp);
2041 auto it = m.find("osd_objectstore");
2042 if (it != m.end()) {
2043 if (it->second == "filestore") {
2044 filestore_osds.insert(p->first);
2045 } else {
2046 filestore_osds.erase(p->first);
2047 }
2048 }
2049 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2050 }
2051 for (set<int>::iterator p = pending_metadata_rm.begin();
2052 p != pending_metadata_rm.end();
2053 ++p) {
2054 filestore_osds.erase(*p);
2055 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2056 }
2057 pending_metadata.clear();
2058 pending_metadata_rm.clear();
2059
2060 // purged_snaps
2061 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2062 !pending_inc.new_purged_snaps.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2065 bufferlist v;
2066 encode(pending_inc.new_purged_snaps, v);
2067 t->put(OSD_SNAP_PREFIX, k, v);
2068 }
2069 for (auto& i : pending_inc.new_purged_snaps) {
2070 for (auto q = i.second.begin();
2071 q != i.second.end();
2072 ++q) {
2073 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2074 pending_inc.epoch,
2075 t);
2076 }
2077 }
2078 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2079 for (auto snap : snaps) {
2080 insert_purged_snap_update(pool, snap, snap + 1,
2081 pending_inc.epoch,
2082 t);
2083 }
2084 }
2085
2086 // health
2087 health_check_map_t next;
2088 tmp.check_health(cct, &next);
2089 // OSD_FILESTORE
2090 check_for_filestore_osds(&next);
2091 encode_health(next, t);
2092 }
2093
2094 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2095 {
2096 bufferlist bl;
2097 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2098 if (r < 0)
2099 return r;
2100 try {
2101 auto p = bl.cbegin();
2102 decode(m, p);
2103 }
2104 catch (ceph::buffer::error& e) {
2105 if (err)
2106 *err << "osd." << osd << " metadata is corrupt";
2107 return -EIO;
2108 }
2109 return 0;
2110 }
2111
2112 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2113 {
2114 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2115 if (osdmap.is_up(osd)) {
2116 map<string,string> meta;
2117 load_metadata(osd, meta, nullptr);
2118 auto p = meta.find(field);
2119 if (p == meta.end()) {
2120 (*out)["unknown"]++;
2121 } else {
2122 (*out)[p->second]++;
2123 }
2124 }
2125 }
2126 }
2127
2128 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2129 {
2130 map<string,int> by_val;
2131 count_metadata(field, &by_val);
2132 f->open_object_section(field.c_str());
2133 for (auto& p : by_val) {
2134 f->dump_int(p.first.c_str(), p.second);
2135 }
2136 f->close_section();
2137 }
2138
2139 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2140 {
2141 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2142 if (osdmap.is_up(osd)) {
2143 map<string,string> meta;
2144 load_metadata(osd, meta, nullptr);
2145 auto p = meta.find("ceph_version_short");
2146 if (p == meta.end()) continue;
2147 versions[p->second].push_back(string("osd.") + stringify(osd));
2148 }
2149 }
2150 }
2151
2152 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2153 {
2154 map<string, string> metadata;
2155 int r = load_metadata(osd, metadata, nullptr);
2156 if (r < 0)
2157 return r;
2158
2159 auto it = metadata.find("osd_objectstore");
2160 if (it == metadata.end())
2161 return -ENOENT;
2162 *type = it->second;
2163 return 0;
2164 }
2165
2166 void OSDMonitor::get_filestore_osd_list()
2167 {
2168 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2169 string objectstore_type;
2170 int r = get_osd_objectstore_type(osd, &objectstore_type);
2171 if (r == 0 && objectstore_type == "filestore") {
2172 filestore_osds.insert(osd);
2173 }
2174 }
2175 }
2176
2177 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2178 {
2179 if (g_conf()->mon_warn_on_filestore_osds &&
2180 filestore_osds.size() > 0) {
2181 ostringstream ss, deprecated_tip;
2182 list<string> detail;
2183 ss << filestore_osds.size()
2184 << " osd(s) "
2185 << (filestore_osds.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip << ss.str();
2188 ss << " [Deprecated]";
2189 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2190 filestore_osds.size());
2191 deprecated_tip << ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail.push_back(deprecated_tip.str());
2195 d.detail.swap(detail);
2196 }
2197 }
2198
2199 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2200 const pg_pool_t &pool,
2201 ostream *err)
2202 {
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set<int> checked_osds;
2206 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2207 vector<int> up, acting;
2208 pg_t pgid(ps, pool_id);
2209 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2210 for (int osd : up) {
2211 if (checked_osds.find(osd) != checked_osds.end())
2212 continue;
2213 string objectstore_type;
2214 int r = get_osd_objectstore_type(osd, &objectstore_type);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r < 0 || objectstore_type == "bluestore") {
2217 checked_osds.insert(osd);
2218 continue;
2219 }
2220 *err << "osd." << osd << " uses " << objectstore_type;
2221 return false;
2222 }
2223 }
2224 return true;
2225 }
2226
2227 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2228 {
2229 map<string,string> m;
2230 if (int r = load_metadata(osd, m, err))
2231 return r;
2232 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2233 f->dump_string(p->first.c_str(), p->second);
2234 return 0;
2235 }
2236
2237 void OSDMonitor::print_nodes(Formatter *f)
2238 {
2239 // group OSDs by their hosts
2240 map<string, list<int> > osds; // hostname => osd
2241 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2242 map<string, string> m;
2243 if (load_metadata(osd, m, NULL)) {
2244 continue;
2245 }
2246 map<string, string>::iterator hostname = m.find("hostname");
2247 if (hostname == m.end()) {
2248 // not likely though
2249 continue;
2250 }
2251 osds[hostname->second].push_back(osd);
2252 }
2253
2254 dump_services(f, osds, "osd");
2255 }
2256
void OSDMonitor::share_map_with_random_osd()
{
  // Push the freshly committed map to one randomly chosen up osd; it will
  // gossip the update onward to its peers.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2283
version_t OSDMonitor::get_trim_to() const
{
  // Return the osdmap version we may safely trim up to, or 0 when
  // trimming is currently not allowed.
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // pgs still being created may reference arbitrarily old maps; do not
    // trim anything while creations are pending.
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
            << " ('mon_debug_block_osdmap_trim' set to 'true')"
            << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // floor: the oldest epoch any pg was last clean in...
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // ...unless an operator explicitly forces a trim point.
    if (g_conf()->mon_osd_force_trim_to > 0 &&
        g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
               << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always keep at least mon_min_osdmap_epochs maps around.
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
        floor = get_last_committed() - min;
      else
        floor = 0;
    }
    // only report a trim point if it actually advances past what is
    // already trimmed.
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2331
2332 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2333 {
2334 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
2337 for (auto [osd, epoch] : osd_epochs) {
2338 if (epoch < floor) {
2339 floor = epoch;
2340 }
2341 }
2342 return floor;
2343 }
2344
void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
				   version_t first)
{
  // When trimming, store a full copy of the first surviving epoch so the
  // store always holds at least one full map to rebuild later epochs from.
  dout(10) << __func__ << " including full map for e " << first << dendl;
  bufferlist bl;
  get_version_full(first, bl);
  put_version_full(tx, first, bl);

  // if this trim passed pinned maps, update the prune manifest to match.
  if (has_osdmap_manifest &&
      first > osdmap_manifest.get_first_pinned()) {
    _prune_update_trimmed(tx, first);
  }
}
2358
2359
2360 /* full osdmap prune
2361 *
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2363 */
2364
void OSDMonitor::load_osdmap_manifest()
{
  // Synchronize the in-memory prune manifest with the mon store: drop it
  // if the store no longer has one, otherwise (re)load it.
  bool store_has_manifest =
    mon.store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;
    }
    // the store's manifest went away (e.g., pruning completed and it was
    // erased); forget our stale in-memory copy too.
    dout(20) << __func__
	     << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
	   << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // existence was verified just above, so a read failure here is fatal.
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
	   << osdmap_manifest.get_first_pinned()
	   << " .. "
	   << osdmap_manifest.get_last_pinned()
	   << ")"
	   << dendl;
}
2401
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-map prune pass is warranted, based on the span
  // of committed versions, the prune tunables, and how far any previous
  // pass already pinned.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never prune into the most recent min_osdmap_epochs maps.
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous pass already pinned up to (or past) the prune ceiling.
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one whole prune interval fits before the ceiling.
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2461
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  // A trim removed all maps below `first`; bring the manifest in line by
  // pinning `first` and dropping every pin below it, preserving the
  // invariant first_pinned == first_committed.
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase all pins strictly below `first`.
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2495
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first pin for a new prune pass: the first
  // committed version on a fresh prune, or the last pin where a previous
  // pass left off.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2530
2531 bool OSDMonitor::_prune_sanitize_options() const
2532 {
2533 uint64_t prune_interval =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min =
2536 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2537 uint64_t txsize =
2538 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2539
2540 bool r = true;
2541
2542 if (prune_interval == 0) {
2543 derr << __func__
2544 << " prune is enabled BUT prune interval is zero; abort."
2545 << dendl;
2546 r = false;
2547 } else if (prune_interval == 1) {
2548 derr << __func__
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2551 << dendl;
2552 r = false;
2553 }
2554 if (prune_min == 0) {
2555 derr << __func__
2556 << " prune is enabled BUT prune min is zero; abort."
2557 << dendl;
2558 r = false;
2559 }
2560 if (prune_interval > prune_min) {
2561 derr << __func__
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2565 << dendl;
2566 r = false;
2567 }
2568
2569 if (txsize < prune_interval - 1) {
2570 derr << __func__
2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2573 << "); abort." << dendl;
2574 r = false;
2575 }
2576 return r;
2577 }
2578
// True when the operator has enabled full-osdmap pruning via
// 'mon_osdmap_full_prune_enabled'.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2582
// True when the monitor quorum requires the OSDMAP_PRUNE feature, i.e.
// all monitors understand the prune manifest.
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2587
2588 /** do_prune
2589 *
2590 * @returns true if has side-effects; false otherwise.
2591 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  // Prune full osdmaps from the store (see doc/dev/mon-osdmap-prune.rst),
  // pinning one map per prune_interval and erasing the full maps between
  // pins.  Returns true iff the transaction was modified.
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  // prune one whole interval per iteration until the txsize budget is
  // exhausted or we reach last_to_pin.
  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pins.
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits.
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2716
2717
2718 // -------------
2719
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  // Read-side dispatch.  Returns true when the message was fully handled
  // here without a map change; false to forward the op to prepare_update()
  // for a paxos round.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing.
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // message types are filtered before reaching this service; anything
    // else indicates a dispatch bug.
    ceph_abort();
    return true;
  }
}
2775
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  // Write-side dispatch for ops that preprocess_query() decided need a
  // map update.  Returns true if pending_inc was (possibly) modified and
  // should be proposed.
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing.
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // only types routed here by preprocess_query() are expected.
    ceph_abort();
  }

  return false;
}
2827
bool OSDMonitor::should_propose(double& delay)
{
  // Decide whether pending changes warrant an immediate paxos proposal
  // (delay may be set to 0) or the default PaxosService pacing.
  dout(10) << "should_propose" << dendl;

  // if full map, propose immediately!  any subsequent changes will be clobbered.
  if (pending_inc.fullmap.length())
    return true;

  // adjust osd weights?
  // NOTE(review): this applies the accumulated reweights to pending_inc as
  // a side effect of a "should we propose" query; callers rely on it.
  if (!osd_weight.empty() &&
      osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
    dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
    osdmap.adjust_osd_weights(osd_weight, pending_inc);
    delay = 0.0;
    osd_weight.clear();
    return true;
  }

  return PaxosService::should_propose(delay);
}
2848
2849
2850
2851 // ---------------------------
2852 // READs
2853
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an explicit osdmap fetch with the requested ranges of full and
  // incremental maps, bounded by per-message count and byte budgets.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode for the peer's feature set; fall back to quorum features for
  // anonymous connections.
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; both loops draw from the same shared budgets, so a
  // large full-map range can crowd out incrementals.
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->cluster_osdmap_trim_lower_bound = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2890
2891
2892 // ---------------------------
2893 // UPDATEs
2894
2895 // failure --
2896
// Vet the sender of an osd-originated message.  Returns true when the
// message should be DROPPED: the sender lacks MON_CAP_X on "osd", or the
// fsid does not match ours.  (No session means the connection is already
// gone; nothing useful to do either.)
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
	    << session->caps << dendl;
    return true;
  }
  if (fsid != mon.monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
	    << " != " << mon.monmap->fsid << dendl;
    return true;
  }
  return false;
}
2914
2915
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Vet a failure report.  Returns true (and acks nothing, possibly
  // sending map updates) when the report is stale, duplicate, or invalid;
  // returns false only for a new, actionable report so prepare_failure()
  // can record it.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, has stale addrs, or is itself marked down;
      // send it newer maps so it can catch up.
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to a previous instance of this osd id.
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down(badboy) half of this check is redundant —
  // the "weird?" branch above already handled it; kept as-is.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2987
// Completion context that acknowledges a MOSDMarkMeDown request once the
// mark-down (or its preprocessing) has finished.  On success it echoes a
// MOSDMarkMeDown back to the requester as the ack; on -EAGAIN it re-queues
// the op for another dispatch pass; any other result is a logic error.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      auto m = op->get_req<MOSDMarkMeDown>();
      // reply with a copy of the request; request_ack=false so the osd
      // does not try to ack our ack.
      osdmon->mon.send_reply(
        op,
        new MOSDMarkMeDown(
          m->fsid,
          m->target_osd,
          m->target_addrs,
          m->get_epoch(),
          false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      osdmon->dispatch(op);
    } else {
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
3016
// Read-only path for an osd asking to mark ITSELF down.  Returns true when
// the request is handled here (invalid sender, osd already down, nodown
// set); returns false to proceed to prepare_mark_me_down().  If the osd
// asked for an ack we still send one on the reject path so it can shut
// down without waiting forever.
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // sender must exist, still be up, and match the map's addrs
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
            << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
           << " " << m->target_addrs << dendl;
  return false;

 reply:
  if (m->request_ack) {
    // ack immediately (r=0) even though no map change happened
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3055
// Write path for a self-requested mark-down: queue the state change in
// pending_inc and, if requested, ack the osd after the proposal commits.
// preprocess_mark_me_down() already validated the sender, hence the asserts.
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
  // new_state is an xor mask: CEPH_OSD_UP here flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->down_and_dead) {
    // record the dead epoch in the osd's extended info as well
    if (!pending_inc.new_xinfo.count(target_osd)) {
      pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
    }
    pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  }
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3077
// Read-only path for an osd declaring ITSELF dead (beyond down).  Returns
// true when handled/dropped here; false to continue to
// prepare_mark_me_dead().  Note the osd must already be down in the map.
bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid)) {
    mon.no_reply(op);
    return true;
  }

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd()) {
    mon.no_reply(op);
    return true;
  }

  // only a known, already-down osd can be marked dead; otherwise update
  // the sender with newer maps and drop the request.
  if (!osdmap.exists(from) ||
      !osdmap.is_down(from)) {
    dout(5) << __func__ << " from nonexistent or up osd." << from
            << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    mon.no_reply(op);
    return true;
  }

  return false;
}
3107
// Write path for a self-declared dead osd: record dead_epoch in the osd's
// xinfo via pending_inc and acknowledge (by dropping the reply) once the
// proposal commits.
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees the osd is down already
  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
                   << m->get_epoch();
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // no explicit reply is sent on success; the osd is typically gone
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
        if (r >= 0) {
          mon.no_reply(op); // ignore on success
        }
      }
      ));
  return true;
}
3133
3134 bool OSDMonitor::can_mark_down(int i)
3135 {
3136 if (osdmap.is_nodown(i)) {
3137 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3138 << "will not mark it down" << dendl;
3139 return false;
3140 }
3141
3142 int num_osds = osdmap.get_num_osds();
3143 if (num_osds == 0) {
3144 dout(5) << __func__ << " no osds" << dendl;
3145 return false;
3146 }
3147 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3148 float up_ratio = (float)up / (float)num_osds;
3149 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3150 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3151 << g_conf()->mon_osd_min_up_ratio
3152 << ", will not mark osd." << i << " down" << dendl;
3153 return false;
3154 }
3155 return true;
3156 }
3157
3158 bool OSDMonitor::can_mark_up(int i)
3159 {
3160 if (osdmap.is_noup(i)) {
3161 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3162 << "will not mark it up" << dendl;
3163 return false;
3164 }
3165
3166 return true;
3167 }
3168
3169 /**
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3172 */
3173 bool OSDMonitor::can_mark_out(int i)
3174 {
3175 if (osdmap.is_noout(i)) {
3176 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3177 << "will not mark it out" << dendl;
3178 return false;
3179 }
3180
3181 int num_osds = osdmap.get_num_osds();
3182 if (num_osds == 0) {
3183 dout(5) << __func__ << " no osds" << dendl;
3184 return false;
3185 }
3186 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3187 float in_ratio = (float)in / (float)num_osds;
3188 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3189 if (i >= 0)
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osd." << i << " out" << dendl;
3193 else
3194 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3195 << g_conf()->mon_osd_min_in_ratio
3196 << ", will not mark osds out" << dendl;
3197 return false;
3198 }
3199
3200 return true;
3201 }
3202
3203 bool OSDMonitor::can_mark_in(int i)
3204 {
3205 if (osdmap.is_noin(i)) {
3206 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3207 << "will not mark it in" << dendl;
3208 return false;
3209 }
3210
3211 return true;
3212 }
3213
// Sweep all outstanding failure reports: queue a mark-down for any osd
// that now has enough reporters past its grace period, and drop records
// that have gone stale without resolution.  Returns true if at least one
// failure was queued in pending_inc (i.e. a proposal is warranted).
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
        check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // reporters never cancelled and the osd never got marked down;
      // forget the record rather than keep it forever
      dout(10) << " dropping stale failure_info for osd." << target_osd
               << " from " << fi.reporters.size() << " reporters"
               << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3235
3236 utime_t OSDMonitor::get_grace_time(utime_t now,
3237 int target_osd,
3238 failure_info_t& fi) const
3239 {
3240 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3241 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3242 return orig_grace;
3243 }
3244 utime_t grace = orig_grace;
3245 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3246 double decay_k = ::log(.5) / halflife;
3247
3248 // scale grace period based on historical probability of 'lagginess'
3249 // (false positive failures due to slowness).
3250 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3251 const utime_t failed_for = now - fi.get_failed_since();
3252 double decay = exp((double)failed_for * decay_k);
3253 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3254 << " failed_for " << failed_for << " decay " << decay << dendl;
3255 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3256 grace += my_grace;
3257
3258 // consider the peers reporting a failure a proxy for a potential
3259 // 'subcluster' over the overall cluster that is similarly
3260 // laggy. this is clearly not true in all cases, but will sometimes
3261 // help us localize the grace correction to a subset of the system
3262 // (say, a rack with a bad switch) that is unhappy.
3263 double peer_grace = 0;
3264 for (auto& [reporter, report] : fi.reporters) {
3265 if (osdmap.exists(reporter)) {
3266 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3267 utime_t elapsed = now - xi.down_stamp;
3268 double decay = exp((double)elapsed * decay_k);
3269 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3270 }
3271 }
3272 peer_grace /= (double)fi.reporters.size();
3273 grace += peer_grace;
3274 dout(10) << " osd." << target_osd << " has "
3275 << fi.reporters.size() << " reporters, "
3276 << grace << " grace (" << orig_grace << " + " << my_grace
3277 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3278 << dendl;
3279
3280 return grace;
3281 }
3282
3283 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3284 {
3285 // already pending failure?
3286 if (pending_inc.new_state.count(target_osd) &&
3287 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3288 dout(10) << " already pending failure" << dendl;
3289 return true;
3290 }
3291
3292 set<string> reporters_by_subtree;
3293 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3294 ceph_assert(fi.reporters.size());
3295 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
3298 if (osdmap.exists(p->first)) {
3299 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3300 if (auto iter = reporter_loc.find(reporter_subtree_level);
3301 iter == reporter_loc.end()) {
3302 reporters_by_subtree.insert("osd." + to_string(p->first));
3303 } else {
3304 reporters_by_subtree.insert(iter->second);
3305 }
3306 ++p;
3307 } else {
3308 fi.cancel_report(p->first);;
3309 p = fi.reporters.erase(p);
3310 }
3311 }
3312 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3313 return false;
3314 }
3315 const utime_t failed_for = now - fi.get_failed_since();
3316 const utime_t grace = get_grace_time(now, target_osd, fi);
3317 if (failed_for >= grace) {
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl;
3320 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3321
3322 mon.clog->info() << "osd." << target_osd << " failed ("
3323 << osdmap.crush->get_full_location_ordered_string(
3324 target_osd)
3325 << ") ("
3326 << (int)reporters_by_subtree.size()
3327 << " reporters from different "
3328 << reporter_subtree_level << " after "
3329 << failed_for << " >= grace " << grace << ")";
3330 return true;
3331 }
3332 return false;
3333 }
3334
3335 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3336 {
3337 // if it takes too long to either cancel the report to mark the osd down,
3338 // some reporters must have failed to cancel their reports. let's just
3339 // forget these reports.
3340 const utime_t failed_for = now - fi.get_failed_since();
3341 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3342 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3343 return failed_for >= (heartbeat_grace + heartbeat_stale);
3344 }
3345
// Immediately queue osd.target_osd to be marked down (and dead) in
// pending_inc, bypassing the reporter/grace machinery.  Used for
// "immediate" failure reports (e.g. connection refused observed by osd.by).
void OSDMonitor::force_failure(int target_osd, int by)
{
  // already pending failure?
  if (pending_inc.new_state.count(target_osd) &&
      pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
    dout(10) << " already pending failure" << dendl;
    return;
  }

  dout(1) << " we're forcing failure of osd." << target_osd << dendl;
  // new_state is an xor mask: CEPH_OSD_UP flips the osd from up to down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  // also record it as dead as of the pending epoch
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;

  mon.clog->info() << "osd." << target_osd << " failed ("
                   << osdmap.crush->get_full_location_ordered_string(target_osd)
                   << ") (connection refused reported by osd." << by << ")";
  return;
}
3367
// Write path for a failure report that passed preprocess_failure().
// A positive report either forces the failure (immediate) or is added to
// failure_info and re-evaluated; a negative report (if_osd_failed()==false)
// retracts the sender's earlier report.  Returns true when pending_inc was
// changed and a proposal should follow.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already verified target is up with matching addrs
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    // back-date the failure start by how long the reporter says it lasted
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
                        << " reported immediately failed by "
                        << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // propose only if this report tipped the osd over the threshold
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
                      << " failure report canceled by "
                      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3427
3428 void OSDMonitor::process_failures()
3429 {
3430 map<int,failure_info_t>::iterator p = failure_info.begin();
3431 while (p != failure_info.end()) {
3432 if (osdmap.is_up(p->first)) {
3433 ++p;
3434 } else {
3435 dout(10) << "process_failures osd." << p->first << dendl;
3436 list<MonOpRequestRef> ls;
3437 p->second.take_report_messages(ls);
3438 failure_info.erase(p++);
3439
3440 while (!ls.empty()) {
3441 MonOpRequestRef o = ls.front();
3442 if (o) {
3443 o->mark_event(__func__);
3444 MOSDFailure *m = o->get_req<MOSDFailure>();
3445 send_latest(o, m->get_epoch());
3446 mon.no_reply(o);
3447 }
3448 ls.pop_front();
3449 }
3450 }
3451 }
3452 }
3453
3454 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3455 {
3456 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3457
3458 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3459 p != failure_info.end();
3460 ++p) {
3461 p->second.take_report_messages(ls);
3462 }
3463 failure_info.clear();
3464 }
3465
3466 int OSDMonitor::get_grace_interval_threshold()
3467 {
3468 int halflife = g_conf()->mon_osd_laggy_halflife;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor = 48;
3472 return halflife * grace_threshold_factor;
3473 }
3474
3475 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3476 {
3477 int grace_interval_threshold_secs = get_grace_interval_threshold();
3478 if (last_failed_interval > grace_interval_threshold_secs) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3481 << dendl;
3482 return true;
3483 }
3484 return false;
3485 }
3486
3487 void OSDMonitor::set_default_laggy_params(int target_osd)
3488 {
3489 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3490 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3491 }
3492 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3493 xi.down_stamp = pending_inc.modified;
3494 xi.laggy_probability = 0.0;
3495 xi.laggy_interval = 0;
3496 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3497 }
3498
3499
3500 // boot --
3501
// Read-only admission checks for an osd boot request.  Returns true when
// the request is fully handled here (rejected, duplicate, or answered with
// maps); returns false to continue to prepare_boot().
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // osd must belong to this cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_PACIFIC)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < octopus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_REEF) &&
      osdmap.require_osd_release < ceph_release_t::pacific) {
    mon.clog->info() << "disallowing boot of reef+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < pacific";
    goto ignore;
  }

  // See crimson/osd/osd.cc: OSD::_send_boot
  if (auto type_iter = m->metadata.find("osd_type");
      type_iter != m->metadata.end()) {
    const auto &otype = type_iter->second;
    // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
    if (otype == "crimson") {
      // crimson osds require the explicit allow_crimson opt-in flag
      if (!osdmap.get_allow_crimson()) {
        mon.clog->info()
          << "Disallowing boot of crimson-osd without allow_crimson "
          << "OSDMap flag.  Run ceph osd set_allow_crimson to set "
          << "allow_crimson flag.  Note that crimson-osd is "
          << "considered unstable and may result in crashes or "
          << "data loss.  Its usage should be restricted to "
          << "testing and development.";
        goto ignore;
      }
    } else {
      derr << __func__ << ": osd " << m->get_orig_source_inst()
           << " sent non-crimson osd_type field in MOSDBoot: "
           << otype
           << " -- booting anyway"
           << dendl;
    }
  }

  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);   // duplicate boot: reply without logging
    return true;
  }

  // same id but a different uuid means a different physical osd is
  // squatting on this id; refuse it
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from before the osd's last up_from, from the same
  // instance: stale retransmit, just send maps
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;   // proceed to prepare_boot()

 ignore:
  return true;
}
3632
// Write path for an osd boot: queue the up state, addrs, metadata, clean
// intervals, laggy statistics and (possibly) weight for the booting osd in
// pending_inc, then reply after the proposal commits.  If the osd is still
// marked up from a previous instance it is first marked down and the boot
// retried on the next map.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
          << " sb " << m->sb
          << " client_addrs" << m->get_connection()->get_peer_addrs()
          << " cluster_addrs " << m->cluster_addrs
          << " hb_back_addrs " << m->hb_back_addrs
          << " hb_front_addrs " << m->hb_front_addrs
          << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective current state = committed state xor any pending toggles
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
            << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
                  m->get_orig_source_addrs()) ||
                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot once the down is committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        // an osd with no maps has no data; treat previous instance as lost
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: boot_epoch == 0 means a clean first boot
    // (decay lagginess), otherwise this is a re-boot after being marked
    // down (reinforce lagginess based on how long it was down).
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
            (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval =  g_conf()->mon_osd_laggy_max_interval;
        }
        // exponentially-weighted moving average of the down interval
        xi.laggy_interval =
          interval * g_conf()->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf()->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        if (xi.old_weight > 0) {
          // restore the weight it had before being auto-marked out
          pending_inc.new_weight[from] = xi.old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3787
// Finish a boot: optionally log it to the cluster log (logit=false for a
// duplicate boot) and send the osd all maps newer than what it has.
void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << "_booted " << m->get_orig_source_inst()
          << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;

  if (logit) {
    mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
                     << " boot";
  }

  send_latest(op, m->sb.current_epoch+1);
}
3802
3803
3804 // -------------
3805 // full
3806
// Read-only path for an osd reporting its fullness flags (nearfull /
// backfillfull / full).  Returns true when handled here (bad sender, or
// the map already reflects the requested state); false to proceed to
// prepare_full().
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  // only these three bits may be toggled through MOSDFull
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // drop messages from a previous (down) instance, or whose addrs do not
  // match the current up instance
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
         m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
            << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // requested state already in the map? just answer with the map.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
            << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
           << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3857
// Write path for a fullness update: compute the effective current
// full-state bits (committed xor pending) and queue the xor needed to
// reach the requested state.  Replies with the map once committed.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective state = committed state xor pending toggles, full bits only
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any previously-pending full-bit toggles before re-adding
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // xor from the COMMITTED state to the wanted state
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3895
3896 // -------------
3897 // alive
3898
// Read-only path for an osd's up_thru request (MOSDAlive).  Returns true
// when handled here (bad sender, or up_thru already satisfied); false to
// proceed to prepare_alive().
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the current up instance of that osd id
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
            << m->get_orig_source() << " " << m->get_orig_source_addrs()
            << dendl;
    goto ignore;
  }

  // up_thru already at least what the osd wants? just reply with the map.
  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
           << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3937
3938 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3939 {
3940 op->mark_osdmon_event(__func__);
3941 auto m = op->get_req<MOSDAlive>();
3942 int from = m->get_orig_source().num();
3943
3944 if (0) { // we probably don't care much about these
3945 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3946 }
3947
3948 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3949 << " from " << m->get_orig_source_inst() << dendl;
3950
3951 update_up_thru(from, m->version); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3953 return true;
3954 }
3955
// Reply to `op` with every map from epoch `e` onward (or the latest full map
// when e == 0; see send_latest).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3964
3965 // pg_created
// Validate the session/caps of an MOSDPGCreated and forward it to the leader.
// No reply is ever sent to the osd for this message (mon.no_reply).
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3985
3986 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3987 {
3988 op->mark_osdmon_event(__func__);
3989 auto m = op->get_req<MOSDPGCreated>();
3990 dout(10) << __func__ << " " << *m << dendl;
3991 auto src = m->get_orig_source();
3992 auto from = src.num();
3993 if (!src.is_osd() ||
3994 !mon.osdmon()->osdmap.is_up(from) ||
3995 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3996 m->get_orig_source_addrs())) {
3997 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3998 return false;
3999 }
4000 pending_created_pgs.push_back(m->pgid);
4001 return true;
4002 }
4003
// Sanity-check an MOSDPGReadyToMerge against the committed map.  Returns true
// (no reply) when the message can be ignored; false to forward to
// prepare_pg_ready_to_merge.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // declared up here because the gotos below may not jump over an initializer
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge source must be exactly the last pg (ps == pg_num - 1)...
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // ...and a merge involving it must actually be pending
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
4043
// Commit a pg merge (m->ready) or cancel a pending merge (!m->ready) on the
// pool, working against the pending pool value if one is already queued.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    // pg_num[_pending] moved since preprocess validated this message; retry
    // once the in-flight proposal has committed.
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes from a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: randomly bounce pg_num back up to exercise merge undo
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4101
4102
4103 // -------------
4104 // pg_temp changes
4105
// Filter an MOSDPGTemp: drop stale/unauthorized senders, skip entries for
// removed pools or pgs whose primary has changed, and only forward to
// prepare_pgtemp (return false) when at least one entry would actually
// change pg_temp/primary_temp state.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
            << session->caps << dendl;
    goto ignore;
  }

  // sender must be the currently-up instance of this osd id
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced pg_temp bypasses all no-op filtering below
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // every surviving entry was a no-op; just reply with the current map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4199
4200 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4201 {
4202 epoch_t old_up_thru = osdmap.get_up_thru(from);
4203 auto ut = pending_inc.new_up_thru.find(from);
4204 if (ut != pending_inc.new_up_thru.end()) {
4205 old_up_thru = ut->second;
4206 }
4207 if (up_thru > old_up_thru) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc.new_up_thru[from] = up_thru;
4210 }
4211 }
4212
// Queue the osd's pg_temp mappings into the pending incremental, skipping
// entries for pools that are gone or pending removal, and clear any
// primary_temp for the affected pgs.
bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGTemp>();
  int from = m->get_orig_source().num();
  dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
  for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    uint64_t pool = p->first.pool();
    if (pending_inc.old_pools.count(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool pending removal" << dendl;
      continue;
    }
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      continue;
    }
    pending_inc.new_pg_temp[p->first] =
      mempool::osdmap::vector<int>(p->second.begin(), p->second.end());

    // unconditionally clear pg_primary (until this message can encode
    // a change for that, too.. at which point we need to also fix
    // preprocess_pg_temp)
    if (osdmap.primary_temp->count(p->first) ||
	pending_inc.new_primary_temp.count(p->first))
      pending_inc.new_primary_temp[p->first] = -1;
  }

  // set up_thru too, so the osd doesn't have to ask again
  update_up_thru(from, m->map_epoch);

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
  return true;
}
4248
4249
4250 // ---
4251
// Fast path for MRemoveSnaps: if every listed snap is already removed (or its
// pool is gone), ack octopus+ senders directly and consume the message.
// Returns false to forward to prepare_remove_snaps when any snap still needs
// to be queued for removal.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
        "osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // any snap that is newer than snap_seq or not yet removed means we
      // actually have work to do -> go to prepare_remove_snaps
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // everything was already removed; octopus+ senders expect an explicit ack
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4301
// Queue the listed snaps for removal in the pending incremental: update the
// pool's removed_snaps (pre-octopus only), snap_seq/snap_epoch, and
// new_removed_snaps, then ack octopus+ senders after the proposal commits.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map or already queued in
      // the pending pool / pending new_removed_snaps
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  // pre-octopus keeps the full removed_snaps interval set in the pool
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus+ senders expect an explicit ack once the removal has committed
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4350
// Answer an MMonGetPurgedSnaps by scanning the purged_epoch_* keys in the
// OSD_SNAP_PREFIX kv range for epochs in (m->start, m->last] and replying
// with the decoded per-pool purged snap intervals.
bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetPurgedSnaps>();
  dout(7) << __func__ << " " << *m << dendl;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;

  string k = make_purged_snap_epoch_key(m->start);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->upper_bound(k);
  // `epoch` must be unsigned long to match the %lx conversion below
  unsigned long epoch = m->last;
  while (it->valid()) {
    if (it->key().find("purged_epoch_") != 0) {
      break;
    }
    string k = it->key();  // shadows the outer key on purpose
    int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
    if (n != 1) {
      derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
    } else if (epoch > m->last) {
      break;
    } else {
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      auto &v = r[epoch];
      try {
	ceph::decode(v, p);
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " unable to parse value for key '" << it->key()
	     << "': \n";
	bl.hexdump(*_dout);
	*_dout << dendl;
      }
      // reuse n (the sscanf result, 1 here) as a rough size estimate
      n += 4 + v.size() * 16;
    }
    // NOTE(review): n is recomputed every iteration, so this caps the size of
    // a single entry rather than the cumulative reply size -- confirm whether
    // the limit was meant to accumulate across entries.
    if (n > 1048576) {
      // impose a semi-arbitrary limit to message size
      break;
    }
    it->next();
  }

  // reply covers [m->start, last epoch actually scanned]
  auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
  reply->purged_snaps.swap(r);
  mon.send_reply(op, reply.detach());

  return true;
}
4400
4401 // osd beacon
// osd beacon: validate session/caps only; the beacon itself is always
// forwarded to the leader (return false), which tracks last_osd_report.
bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  // check caps
  auto session = op->get_session();
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << " no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
         << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // Always forward the beacon to the leader, even if they are the same as
  // the old one. The leader will mark as down osds that haven't sent
  // beacon for a few minutes.
  return false;
}
4422
// Leader-side beacon handling: refresh last_osd_report / osd_epochs /
// last_epoch_clean for the sender.  Returns true (needs a proposal) only when
// the beacon advances last_purged_snaps_scrub in the osd's xinfo.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // record when (and how often) this osd promises to report in
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  // fold the osd's per-pg min_last_epoch_clean into our tracker, skipping
  // pgs whose pool no longer exists
  for (const auto& pg : beacon->pgs) {
    if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
      unsigned pg_num = pool->get_pg_num();
      last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
    }
  }

  // only a newer last_purged_snaps_scrub requires a map change (new_xinfo)
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4467
4468 // ---------------
4469 // map helpers
4470
4471 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4472 {
4473 op->mark_osdmon_event(__func__);
4474 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4475 << " start " << start << dendl;
4476 if (start == 0)
4477 send_full(op);
4478 else
4479 send_incremental(op, start);
4480 }
4481
4482
4483 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4484 {
4485 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4486 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4487 r->cluster_osdmap_trim_lower_bound = get_first_committed();
4488 r->newest_map = osdmap.get_epoch();
4489 return r;
4490 }
4491
// Build an MOSDMap carrying incrementals for epochs [from, to] (encoded for
// `features`); if an incremental is missing (trimmed), fall back to the full
// map for that epoch.  Aborts if neither exists.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->cluster_osdmap_trim_lower_bound = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // iterate newest-first; `e > 0` guards against epoch_t underflow
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4525
// Reply to `op` with a single message carrying the latest full osdmap,
// encoded for the requester's connection features.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon.send_reply(op, build_latest_full(op->get_session()->con_features));
}
4532
// Send incrementals starting at `first` in reply to `op`.  If the op was
// proxied through another monitor, route the request back so the proxy does
// the sending; otherwise send directly on this session.
void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
{
  op->mark_osdmon_event(__func__);

  MonSession *s = op->get_session();
  ceph_assert(s);

  if (s->proxy_con) {
    // oh, we can tell the other mon to do it
    dout(10) << __func__ << " asking proxying mon to send_incremental from "
	     << first << dendl;
    MRoute *r = new MRoute(s->proxy_tid, NULL);
    r->send_osdmap_first = first;
    s->proxy_con->send_message(r);
    op->mark_event("reply: send routed send_osdmap_first reply");
  } else {
    // do it ourselves
    send_incremental(first, s, false, op);
  }
}
4553
// Send maps [first .. current] to `session`, batching up to
// osd_map_message_max epochs per message.  If `first` predates our oldest
// committed map, start with a full map at the trim bound.  When `req` is set
// only one reply message is sent (the peer will re-request the rest); when
// `onetime` is set only one batch is pushed.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  // skip epochs the session is already known to have
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range has been trimmed; start from a full map at the bound
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->cluster_osdmap_trim_lower_bound = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // request/reply path: one message only, peer re-requests the rest
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req/onetime callers get exactly one batch per call
    if (onetime || req)
      break;
  }
}
4616
// Convenience overload: fetch incremental `ver` encoded with the current
// quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4621
// Re-encode an incremental map blob in place for a peer with `features`,
// also re-encoding any embedded full map or crush map it carries.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4649
4650 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4651 {
4652 OSDMap m;
4653 auto q = bl.cbegin();
4654 m.decode(q);
4655 // always encode with subset of osdmap's canonical features
4656 uint64_t f = features & m.get_encoding_features();
4657 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4658 << dendl;
4659 bl.clear();
4660 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4661 }
4662
// Fetch incremental `ver` encoded for `features`, with a feature-keyed
// cache; re-encodes from the stored blob only when the significant feature
// bits differ from the quorum's.
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4684
4685 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4686 {
4687 bufferlist inc_bl;
4688 int err = get_version(ver, inc_bl);
4689 ceph_assert(err == 0);
4690 ceph_assert(inc_bl.length());
4691
4692 auto p = inc_bl.cbegin();
4693 inc.decode(p);
4694 dout(10) << __func__ << " "
4695 << " epoch " << inc.epoch
4696 << " inc_crc " << inc.inc_crc
4697 << " full_crc " << inc.full_crc
4698 << " encode_features " << inc.encode_features << dendl;
4699 return 0;
4700 }
4701
// Reconstruct the full osdmap for epoch `ver` from the osdmap manifest:
// start at the closest pinned (or cached) full map at or below `ver` and
// replay incrementals up to `ver`, encoding the result into `bl`.
// Returns -ENOENT when no pinned map at or below `ver` exists.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer a cached full map newer than the pinned one to shorten the replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
                                &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // replay incrementals; remember the last incremental's encode features so
  // the final encode matches what the cluster produced
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
        inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
        f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
        derr << __func__
             << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
             << ", expected " << inc.full_crc << ")" << dendl;
        ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4801
// Convenience overload: fetch the full map for `ver` encoded with the
// current quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4806
// Fetch the full map for `ver` encoded for `features`, with a feature-keyed
// cache; falls back to rebuilding a trimmed map from the pinned-map manifest,
// and re-encodes only when the significant feature bits differ.
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4833
4834 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4835 {
4836 dout(10) << "blocklist " << av << " until " << until << dendl;
4837 for (auto a : av.v) {
4838 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4839 a.set_type(entity_addr_t::TYPE_ANY);
4840 } else {
4841 a.set_type(entity_addr_t::TYPE_LEGACY);
4842 }
4843 pending_inc.new_blocklist[a] = until;
4844 }
4845 return pending_inc.epoch;
4846 }
4847
4848 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4849 {
4850 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4851 a.set_type(entity_addr_t::TYPE_ANY);
4852 } else {
4853 a.set_type(entity_addr_t::TYPE_LEGACY);
4854 }
4855 dout(10) << "blocklist " << a << " until " << until << dendl;
4856 pending_inc.new_blocklist[a] = until;
4857 return pending_inc.epoch;
4858 }
4859
4860
4861 void OSDMonitor::check_osdmap_subs()
4862 {
4863 dout(10) << __func__ << dendl;
4864 if (!osdmap.get_epoch()) {
4865 return;
4866 }
4867 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4868 if (osdmap_subs == mon.session_map.subs.end()) {
4869 return;
4870 }
4871 auto p = osdmap_subs->second->begin();
4872 while (!p.end()) {
4873 auto sub = *p;
4874 ++p;
4875 check_osdmap_sub(sub);
4876 }
4877 }
4878
// Satisfy one "osdmap" subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either retire a onetime sub or
// advance it past the current epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon.session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4894
4895 void OSDMonitor::check_pg_creates_subs()
4896 {
4897 if (!osdmap.get_num_up_osds()) {
4898 return;
4899 }
4900 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4901 mon.with_session_map([this](const MonSessionMap& session_map) {
4902 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4903 if (pg_creates_subs == session_map.subs.end()) {
4904 return;
4905 }
4906 for (auto sub : *pg_creates_subs->second) {
4907 check_pg_creates_sub(sub);
4908 }
4909 });
4910 }
4911
4912 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4913 {
4914 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4915 ceph_assert(sub->type == "osd_pg_creates");
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
4918 if (sub->session->name.is_osd() &&
4919 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4920 sub->next = send_pg_creates(sub->session->name.num(),
4921 sub->session->con.get(),
4922 sub->next);
4923 }
4924 }
4925
// Stage an "application enable" (and optional key/value metadata) change
// for a pool into the pending incremental.  The change takes effect when
// the pending map is proposed and committed.
void OSDMonitor::do_application_enable(int64_t pool_id,
				       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value,
				       bool force)
{
  // Caller must hold the paxos transaction plugged and we must be writeable.
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
	   << dendl;

  // Pool application metadata only exists from luminous onward.
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // Start from the pending copy of the pool if one is already queued, so we
  // don't clobber other changes staged in this same proposal.
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // Enable the application with no metadata; no-op if already enabled.
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // Overwrite any existing value for this key.
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // insert() will not overwrite an existing application entry, so a
      // previously-set key/value is left untouched unless 'force' is given.
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4959
4960 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4961 pool_opts_t::key_t opt,
4962 pool_opts_t::value_t val)
4963 {
4964 dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
4965 << " val: " << val << dendl;
4966 auto p = pending_inc.new_pools.try_emplace(
4967 pool_id, *osdmap.get_pg_pool(pool_id));
4968 p.first->second.opts.set(opt, val);
4969 }
4970
// Scan 'pools' for pools whose PGs still need to be created and queue them
// on 'creating_pgs'.  Pools already fully created, pools being removed,
// pools with a nonexistent crush rule, and pools unchanged since the last
// scan are skipped.  Returns the number of pools queued.
unsigned OSDMonitor::scan_for_creating_pgs(
  const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
  const mempool::osdmap::set<int64_t>& removed_pools,
  utime_t modified,
  creating_pgs_t* creating_pgs) const
{
  unsigned queued = 0;
  for (auto& p : pools) {
    int64_t poolid = p.first;
    if (creating_pgs->created_pools.count(poolid)) {
      dout(10) << __func__ << " already created " << poolid << dendl;
      continue;
    }
    const pg_pool_t& pool = p.second;
    int ruleno = pool.get_crush_rule();
    // a pool without a valid crush rule cannot be mapped, so don't queue it
    if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
      continue;

    // skip pools that have not changed since the previous scan
    const auto last_scan_epoch = creating_pgs->last_scan_epoch;
    const auto created = pool.get_last_change();
    if (last_scan_epoch && created <= last_scan_epoch) {
      dout(10) << __func__ << " no change in pool " << poolid
	       << " " << pool << dendl;
      continue;
    }
    if (removed_pools.count(poolid)) {
      dout(10) << __func__ << " pool is being removed: " << poolid
	       << " " << pool << dendl;
      continue;
    }
    dout(10) << __func__ << " queueing pool create for " << poolid
	     << " " << pool << dendl;
    creating_pgs->create_pool(poolid, pool.get_pg_num(),
			      created, modified);
    queued++;
  }
  return queued;
}
5009
// Rebuild creating_pgs_by_osd_epoch: for every PG still being created,
// decide which OSD (its current acting primary) should receive the create
// message and as of which epoch.  If the target OSD is unchanged from the
// previous pass, the previously noted epoch is carried over so the OSD is
// not re-sent creates it already has.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default to the epoch the pg was queued for creation
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch we noted last time
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5057
// Send pending pg-create messages for PGs whose acting primary is 'osd',
// covering epochs >= 'next'.  Returns the epoch the subscriber is now
// current through (last sent epoch + 1), or 'next' unchanged when there is
// nothing (or nothing fresh enough) to send.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  auto m = make_message<MOSDPGCreate2>(creating_pgs_epoch);

  epoch_t last = 0;
  // walk all epochs >= next and collect the pgs to create at each
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				   create->second.create_stamp));
      if (create->second.history.epoch_created) {
	// carry history/past_intervals when they were preserved (e.g. pg merge)
	dout(20) << __func__ << " " << pg << " " << create->second.history
		 << " " << create->second.past_intervals << dendl;
	m->pg_extra.emplace(pg, make_pair(create->second.history,
					  create->second.past_intervals));
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  if (!m->pgs.empty()) {
    con->send_message2(std::move(m));
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5112
5113 // TICK
5114
5115
// Periodic maintenance.  Every monitor refreshes cached state (manifest,
// cache autotuning); the leader additionally evaluates beacon timeouts and
// failure reports, auto-marks long-down OSDs out, expires blocklist
// entries, and proposes a new map epoch when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down if their beacons have timed out
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // elapsed time since the osd went down
      ++i;  // advance before we possibly erase 'o' below

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  continue;  // still within grace; keep it in down_pending_out
      }

      // osd no longer needs tracking (marked out, back up, or already out)
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
	       << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5290
5291 void OSDMonitor::_set_new_cache_sizes()
5292 {
5293 uint64_t cache_size = 0;
5294 int64_t inc_alloc = 0;
5295 int64_t full_alloc = 0;
5296 int64_t kv_alloc = 0;
5297
5298 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5299 cache_size = pcm->get_tuned_mem();
5300 inc_alloc = inc_cache->get_committed_size();
5301 full_alloc = full_cache->get_committed_size();
5302 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5303 }
5304
5305 inc_osd_cache.set_bytes(inc_alloc);
5306 full_osd_cache.set_bytes(full_alloc);
5307
5308 dout(1) << __func__ << " cache_size:" << cache_size
5309 << " inc_alloc: " << inc_alloc
5310 << " full_alloc: " << full_alloc
5311 << " kv_alloc: " << kv_alloc
5312 << dendl;
5313 }
5314
// Check every up OSD for a missed beacon and stage a down-mark in the
// pending incremental for any that have been silent too long.
// 'last_osd_report' maps osd id -> (time of last beacon, beacon interval).
// Returns true if any OSD was newly marked down.
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
	mon.clog->info() << "osd." << i << " marked down after no beacon for "
			 << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second.first
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to carry state bits to toggle;
	// setting CEPH_OSD_UP on an up osd marks it down when the
	// incremental is applied -- confirm against OSDMap::Incremental.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5358
5359 static void dump_cpu_list(Formatter *f, const char *name,
5360 const string& strlist)
5361 {
5362 cpu_set_t cpu_set;
5363 size_t cpu_set_size;
5364 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5365 return;
5366 }
5367 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5368 f->open_array_section(name);
5369 for (auto cpu : cpus) {
5370 f->dump_int("cpu", cpu);
5371 }
5372 f->close_section();
5373 }
5374
// Dump monitor-side osdmap state into 'f': the map itself, per-osd
// metadata, clean-epoch bookkeeping, the committed epoch range, the crush
// map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f, cct);
  f->close_section();

  // per-osd metadata for every existing osd id
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5423
namespace {
  // Keys accepted by "osd pool get"; the enum order is part of the
  // file-local protocol between the parser and the dump code below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the members of 'first' that do not appear in 'second'.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			       const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> remainder;
      for (const auto& choice : first) {
	if (!second.count(choice)) {
	  remainder.insert(choice);
	}
      }
      return remainder;
    }
}
5458
5459
5460 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5461 {
5462 op->mark_osdmon_event(__func__);
5463 auto m = op->get_req<MMonCommand>();
5464 int r = 0;
5465 bufferlist rdata;
5466 stringstream ss, ds;
5467
5468 cmdmap_t cmdmap;
5469 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5470 string rs = ss.str();
5471 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5472 return true;
5473 }
5474
5475 MonSession *session = op->get_session();
5476 if (!session) {
5477 derr << __func__ << " no session" << dendl;
5478 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5479 return true;
5480 }
5481
5482 string prefix;
5483 cmd_getval(cmdmap, "prefix", prefix);
5484
5485 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5486 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5487
5488 if (prefix == "osd stat") {
5489 if (f) {
5490 f->open_object_section("osdmap");
5491 osdmap.print_summary(f.get(), ds, "", true);
5492 f->close_section();
5493 f->flush(rdata);
5494 } else {
5495 osdmap.print_summary(nullptr, ds, "", true);
5496 rdata.append(ds);
5497 }
5498 }
5499 else if (prefix == "osd dump" ||
5500 prefix == "osd tree" ||
5501 prefix == "osd tree-from" ||
5502 prefix == "osd ls" ||
5503 prefix == "osd getmap" ||
5504 prefix == "osd getcrushmap" ||
5505 prefix == "osd ls-tree" ||
5506 prefix == "osd info") {
5507
5508 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5509 bufferlist osdmap_bl;
5510 int err = get_version_full(epoch, osdmap_bl);
5511 if (err == -ENOENT) {
5512 r = -ENOENT;
5513 ss << "there is no map for epoch " << epoch;
5514 goto reply;
5515 }
5516 ceph_assert(err == 0);
5517 ceph_assert(osdmap_bl.length());
5518
5519 OSDMap *p;
5520 if (epoch == osdmap.get_epoch()) {
5521 p = &osdmap;
5522 } else {
5523 p = new OSDMap;
5524 p->decode(osdmap_bl);
5525 }
5526
5527 auto sg = make_scope_guard([&] {
5528 if (p != &osdmap) {
5529 delete p;
5530 }
5531 });
5532
5533 if (prefix == "osd dump") {
5534 stringstream ds;
5535 if (f) {
5536 f->open_object_section("osdmap");
5537 p->dump(f.get(), cct);
5538 f->close_section();
5539 f->flush(ds);
5540 } else {
5541 p->print(cct, ds);
5542 }
5543 rdata.append(ds);
5544 if (!f)
5545 ds << " ";
5546 } else if (prefix == "osd ls") {
5547 if (f) {
5548 f->open_array_section("osds");
5549 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5550 if (osdmap.exists(i)) {
5551 f->dump_int("osd", i);
5552 }
5553 }
5554 f->close_section();
5555 f->flush(ds);
5556 } else {
5557 bool first = true;
5558 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5559 if (osdmap.exists(i)) {
5560 if (!first)
5561 ds << "\n";
5562 first = false;
5563 ds << i;
5564 }
5565 }
5566 }
5567 rdata.append(ds);
5568 } else if (prefix == "osd info") {
5569 int64_t osd_id;
5570 bool do_single_osd = true;
5571 if (!cmd_getval(cmdmap, "id", osd_id)) {
5572 do_single_osd = false;
5573 }
5574
5575 if (do_single_osd && !osdmap.exists(osd_id)) {
5576 ss << "osd." << osd_id << " does not exist";
5577 r = -EINVAL;
5578 goto reply;
5579 }
5580
5581 if (f) {
5582 if (do_single_osd) {
5583 osdmap.dump_osd(osd_id, f.get());
5584 } else {
5585 osdmap.dump_osds(f.get());
5586 }
5587 f->flush(ds);
5588 } else {
5589 if (do_single_osd) {
5590 osdmap.print_osd(osd_id, ds);
5591 } else {
5592 osdmap.print_osds(ds);
5593 }
5594 }
5595 rdata.append(ds);
5596 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5597 string bucket;
5598 if (prefix == "osd tree-from") {
5599 cmd_getval(cmdmap, "bucket", bucket);
5600 if (!osdmap.crush->name_exists(bucket)) {
5601 ss << "bucket '" << bucket << "' does not exist";
5602 r = -ENOENT;
5603 goto reply;
5604 }
5605 int id = osdmap.crush->get_item_id(bucket);
5606 if (id >= 0) {
5607 ss << "\"" << bucket << "\" is not a bucket";
5608 r = -EINVAL;
5609 goto reply;
5610 }
5611 }
5612
5613 vector<string> states;
5614 cmd_getval(cmdmap, "states", states);
5615 unsigned filter = 0;
5616 for (auto& s : states) {
5617 if (s == "up") {
5618 filter |= OSDMap::DUMP_UP;
5619 } else if (s == "down") {
5620 filter |= OSDMap::DUMP_DOWN;
5621 } else if (s == "in") {
5622 filter |= OSDMap::DUMP_IN;
5623 } else if (s == "out") {
5624 filter |= OSDMap::DUMP_OUT;
5625 } else if (s == "destroyed") {
5626 filter |= OSDMap::DUMP_DESTROYED;
5627 } else {
5628 ss << "unrecognized state '" << s << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 }
5633 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5634 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5635 ss << "cannot specify both 'in' and 'out'";
5636 r = -EINVAL;
5637 goto reply;
5638 }
5639 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5640 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5641 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5642 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5643 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5644 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5645 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5646 r = -EINVAL;
5647 goto reply;
5648 }
5649 if (f) {
5650 f->open_object_section("tree");
5651 p->print_tree(f.get(), NULL, filter, bucket);
5652 f->close_section();
5653 f->flush(ds);
5654 } else {
5655 p->print_tree(NULL, &ds, filter, bucket);
5656 }
5657 rdata.append(ds);
5658 } else if (prefix == "osd getmap") {
5659 rdata.append(osdmap_bl);
5660 ss << "got osdmap epoch " << p->get_epoch();
5661 } else if (prefix == "osd getcrushmap") {
5662 p->crush->encode(rdata, mon.get_quorum_con_features());
5663 ss << p->get_crush_version();
5664 } else if (prefix == "osd ls-tree") {
5665 string bucket_name;
5666 cmd_getval(cmdmap, "name", bucket_name);
5667 set<int> osds;
5668 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5669 if (r == -ENOENT) {
5670 ss << "\"" << bucket_name << "\" does not exist";
5671 goto reply;
5672 } else if (r < 0) {
5673 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5674 goto reply;
5675 }
5676
5677 if (f) {
5678 f->open_array_section("osds");
5679 for (auto &i : osds) {
5680 if (osdmap.exists(i)) {
5681 f->dump_int("osd", i);
5682 }
5683 }
5684 f->close_section();
5685 f->flush(ds);
5686 } else {
5687 bool first = true;
5688 for (auto &i : osds) {
5689 if (osdmap.exists(i)) {
5690 if (!first)
5691 ds << "\n";
5692 first = false;
5693 ds << i;
5694 }
5695 }
5696 }
5697
5698 rdata.append(ds);
5699 }
5700 } else if (prefix == "osd getmaxosd") {
5701 if (f) {
5702 f->open_object_section("getmaxosd");
5703 f->dump_unsigned("epoch", osdmap.get_epoch());
5704 f->dump_int("max_osd", osdmap.get_max_osd());
5705 f->close_section();
5706 f->flush(rdata);
5707 } else {
5708 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5709 rdata.append(ds);
5710 }
5711 } else if (prefix == "osd utilization") {
5712 string out;
5713 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5714 if (f)
5715 f->flush(rdata);
5716 else
5717 rdata.append(out);
5718 r = 0;
5719 goto reply;
5720 } else if (prefix == "osd find") {
5721 int64_t osd;
5722 if (!cmd_getval(cmdmap, "id", osd)) {
5723 ss << "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5725 r = -EINVAL;
5726 goto reply;
5727 }
5728 if (!osdmap.exists(osd)) {
5729 ss << "osd." << osd << " does not exist";
5730 r = -ENOENT;
5731 goto reply;
5732 }
5733 string format;
5734 cmd_getval(cmdmap, "format", format);
5735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5736 f->open_object_section("osd_location");
5737 f->dump_int("osd", osd);
5738 f->dump_object("addrs", osdmap.get_addrs(osd));
5739 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5740
5741 // try to identify host, pod/container name, etc.
5742 map<string,string> m;
5743 load_metadata(osd, m, nullptr);
5744 if (auto p = m.find("hostname"); p != m.end()) {
5745 f->dump_string("host", p->second);
5746 }
5747 for (auto& k : {
5748 "pod_name", "pod_namespace", // set by rook
5749 "container_name" // set by cephadm, ceph-ansible
5750 }) {
5751 if (auto p = m.find(k); p != m.end()) {
5752 f->dump_string(k, p->second);
5753 }
5754 }
5755
5756 // crush is helpful too
5757 f->open_object_section("crush_location");
5758 map<string,string> loc = osdmap.crush->get_full_location(osd);
5759 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5760 f->dump_string(p->first.c_str(), p->second);
5761 f->close_section();
5762 f->close_section();
5763 f->flush(rdata);
5764 } else if (prefix == "osd metadata") {
5765 int64_t osd = -1;
5766 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5767 !cmd_getval(cmdmap, "id", osd)) {
5768 ss << "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5770 r = -EINVAL;
5771 goto reply;
5772 }
5773 if (osd >= 0 && !osdmap.exists(osd)) {
5774 ss << "osd." << osd << " does not exist";
5775 r = -ENOENT;
5776 goto reply;
5777 }
5778 string format;
5779 cmd_getval(cmdmap, "format", format);
5780 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5781 if (osd >= 0) {
5782 f->open_object_section("osd_metadata");
5783 f->dump_unsigned("id", osd);
5784 r = dump_osd_metadata(osd, f.get(), &ss);
5785 if (r < 0)
5786 goto reply;
5787 f->close_section();
5788 } else {
5789 r = 0;
5790 f->open_array_section("osd_metadata");
5791 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5792 if (osdmap.exists(i)) {
5793 f->open_object_section("osd");
5794 f->dump_unsigned("id", i);
5795 r = dump_osd_metadata(i, f.get(), NULL);
5796 if (r == -EINVAL || r == -ENOENT) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i << dendl;
5799 r = 0;
5800 } else if (r < 0) {
5801 // Unexpected error
5802 goto reply;
5803 }
5804 f->close_section();
5805 }
5806 }
5807 f->close_section();
5808 }
5809 f->flush(rdata);
5810 } else if (prefix == "osd versions") {
5811 if (!f)
5812 f.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f.get());
5814 f->flush(rdata);
5815 r = 0;
5816 } else if (prefix == "osd count-metadata") {
5817 if (!f)
5818 f.reset(Formatter::create("json-pretty"));
5819 string field;
5820 cmd_getval(cmdmap, "property", field);
5821 count_metadata(field, f.get());
5822 f->flush(rdata);
5823 r = 0;
// "osd numa-status": report per-OSD NUMA placement (network NIC node,
// objectstore node, combined affinity node, CPU list) gathered from each
// OSD's metadata.  Formatter output is an array of per-osd objects;
// plain-text output is a TextTable.
5824 } else if (prefix == "osd numa-status") {
5825 TextTable tbl;
5826 if (f) {
5827 f->open_array_section("osds");
5828 } else {
5829 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5830 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5831 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5832 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5833 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5834 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5835 }
5836 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5837 if (osdmap.exists(i)) {
5838 map<string,string> m;
5839 ostringstream err;
// OSDs whose metadata cannot be loaded are silently omitted from the
// report (the error text in 'err' is discarded).
5840 if (load_metadata(i, m, &err) < 0) {
5841 continue;
5842 }
5843 string host;
5844 auto p = m.find("hostname");
5845 if (p != m.end()) {
5846 host = p->second;
5847 }
5848 if (f) {
5849 f->open_object_section("osd");
5850 f->dump_int("osd", i);
5851 f->dump_string("host", host);
// Single-node keys are dumped as integers.  atoi() yields 0 on
// malformed metadata values; no parse error is surfaced.
5852 for (auto n : { "network_numa_node", "objectstore_numa_node",
5853 "numa_node" }) {
5854 p = m.find(n);
5855 if (p != m.end()) {
5856 f->dump_int(n, atoi(p->second.c_str()));
5857 }
5858 }
// Multi-node keys are comma-separated lists; dump each as an array.
5859 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5860 p = m.find(n);
5861 if (p != m.end()) {
5862 list<string> ls = get_str_list(p->second, ",");
5863 f->open_array_section(n);
5864 for (auto node : ls) {
5865 f->dump_int("node", atoi(node.c_str()));
5866 }
5867 f->close_section();
5868 }
5869 }
5870 for (auto n : { "numa_node_cpus" }) {
5871 p = m.find(n);
5872 if (p != m.end()) {
5873 dump_cpu_list(f.get(), n, p->second);
5874 }
5875 }
5876 f->close_section();
5877 } else {
// Plain-text row: missing metadata keys render as "-".
5878 tbl << i;
5879 tbl << host;
5880 p = m.find("network_numa_nodes");
5881 if (p != m.end()) {
5882 tbl << p->second;
5883 } else {
5884 tbl << "-";
5885 }
5886 p = m.find("objectstore_numa_nodes");
5887 if (p != m.end()) {
5888 tbl << p->second;
5889 } else {
5890 tbl << "-";
5891 }
// AFFINITY and CPUS are shown only when both keys are present.
5892 p = m.find("numa_node");
5893 auto q = m.find("numa_node_cpus");
5894 if (p != m.end() && q != m.end()) {
5895 tbl << p->second;
5896 tbl << q->second;
5897 } else {
5898 tbl << "-";
5899 tbl << "-";
5900 }
5901 tbl << TextTable::endrow;
5902 }
5903 }
5904 }
5905 if (f) {
5906 f->close_section();
5907 f->flush(rdata);
5908 } else {
5909 rdata.append(stringify(tbl));
5910 }
// "osd map <pool> <object> [nspace]": compute where an object maps.
// The object+locator is hashed to a raw pg (object_locator_to_pg), then
// folded to the actual pg (raw_pg_to_pg), and the up/acting sets plus
// primaries are looked up for that pg.
5911 } else if (prefix == "osd map") {
5912 string poolstr, objstr, namespacestr;
5913 cmd_getval(cmdmap, "pool", poolstr);
5914 cmd_getval(cmdmap, "object", objstr);
5915 cmd_getval(cmdmap, "nspace", namespacestr);
5916
5917 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5918 if (pool < 0) {
5919 ss << "pool " << poolstr << " does not exist";
5920 r = -ENOENT;
5921 goto reply;
5922 }
5923 object_locator_t oloc(pool, namespacestr);
5924 object_t oid(objstr);
5925 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5926 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5927 vector<int> up, acting;
5928 int up_p, acting_p;
5929 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5930
// Display name: prefix the namespace (if any) as "<ns>/<name>".
5931 string fullobjname;
5932 if (!namespacestr.empty())
5933 fullobjname = namespacestr + string("/") + oid.name;
5934 else
5935 fullobjname = oid.name;
5936 if (f) {
5937 f->open_object_section("osd_map");
5938 f->dump_unsigned("epoch", osdmap.get_epoch());
5939 f->dump_string("pool", poolstr);
5940 f->dump_int("pool_id", pool);
5941 f->dump_stream("objname") << fullobjname;
5942 f->dump_stream("raw_pgid") << pgid;
5943 f->dump_stream("pgid") << mpgid;
5944 f->open_array_section("up");
5945 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5946 f->dump_int("osd", *p);
5947 f->close_section();
5948 f->dump_int("up_primary", up_p);
5949 f->open_array_section("acting");
5950 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5951 f->dump_int("osd", *p);
5952 f->close_section();
5953 f->dump_int("acting_primary", acting_p);
5954 f->close_section(); // osd_map
5955 f->flush(rdata);
5956 } else {
// One-line human-readable summary of the same mapping.
5957 ds << "osdmap e" << osdmap.get_epoch()
5958 << " pool '" << poolstr << "' (" << pool << ")"
5959 << " object '" << fullobjname << "' ->"
5960 << " pg " << pgid << " (" << mpgid << ")"
5961 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5962 << pg_vector_string(acting) << ", p" << acting_p << ")";
5963 rdata.append(ds);
5964 }
5965
// "pg map <pgid>": report up/acting sets for an explicitly-named pg.
5966 } else if (prefix == "pg map") {
5967 pg_t pgid;
5968 vector<int> up, acting;
5969 r = parse_pgid(cmdmap, ss, pgid);
5970 if (r < 0)
5971 goto reply;
5972 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
// NOTE(review): the raw pgid (not mpgid) is passed here, whereas the
// "osd map" handler above passes the folded mpgid — confirm
// pg_to_up_acting_osds handles a raw pgid equivalently.
5973 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5974 if (f) {
5975 f->open_object_section("pg_map");
5976 f->dump_unsigned("epoch", osdmap.get_epoch());
5977 f->dump_stream("raw_pgid") << pgid;
5978 f->dump_stream("pgid") << mpgid;
5979 f->open_array_section("up");
5980 for (auto osd : up) {
5981 f->dump_int("up_osd", osd);
5982 }
5983 f->close_section();
5984 f->open_array_section("acting");
5985 for (auto osd : acting) {
5986 f->dump_int("acting_osd", osd);
5987 }
5988 f->close_section();
5989 f->close_section();
5990 f->flush(rdata);
5991 } else {
5992 ds << "osdmap e" << osdmap.get_epoch()
5993 << " pg " << pgid << " (" << mpgid << ")"
5994 << " -> up " << up << " acting " << acting;
5995 rdata.append(ds);
5996 }
5997 goto reply;
5998
// "osd lspools": list pool id + name for every pool in the osdmap.
// Both paths accumulate into 'ds'; the formatter path flushes into it.
5999 } else if (prefix == "osd lspools") {
6000 if (f)
6001 f->open_array_section("pools");
6002 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
6003 p != osdmap.pools.end();
6004 ++p) {
6005 if (f) {
6006 f->open_object_section("pool");
6007 f->dump_int("poolnum", p->first);
6008 f->dump_string("poolname", osdmap.pool_name[p->first]);
6009 f->close_section();
6010 } else {
6011 ds << p->first << ' ' << osdmap.pool_name[p->first];
// Suppress the trailing newline after the final entry.
6012 if (next(p) != osdmap.pools.end()) {
6013 ds << '\n';
6014 }
6015 }
6016 }
6017 if (f) {
6018 f->close_section();
6019 f->flush(ds);
6020 }
6021 rdata.append(ds);
// "osd blocklist ls" (legacy alias "osd blacklist ls"): dump both the
// per-address blocklist and the range blocklist, each with its expiry
// time.  The formatter path emits two arrays ("blocklist" and
// "range_blocklist"); the plain path emits "addr until" lines.
6022 } else if (prefix == "osd blocklist ls" ||
6023 prefix == "osd blacklist ls") {
6024 if (f)
6025 f->open_array_section("blocklist");
6026
6027 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6028 p != osdmap.blocklist.end();
6029 ++p) {
6030 if (f) {
6031 f->open_object_section("entry");
6032 f->dump_string("addr", p->first.get_legacy_str());
6033 f->dump_stream("until") << p->second;
6034 f->close_section();
6035 } else {
// NOTE: this local 'ss' deliberately shadows the outer command status
// stream; it only formats a single entry before being discarded.
6036 stringstream ss;
6037 string s;
6038 ss << p->first << " " << p->second;
6039 getline(ss, s);
6040 s += "\n";
6041 rdata.append(s);
6042 }
6043 }
6044 if (f) {
6045 f->close_section();
6046 f->flush(rdata);
6047 }
6048 if (f)
6049 f->open_array_section("range_blocklist");
6050
6051 for (auto p = osdmap.range_blocklist.begin();
6052 p != osdmap.range_blocklist.end();
6053 ++p) {
6054 if (f) {
6055 f->open_object_section("entry");
6056 f->dump_string("range", p->first.get_legacy_str());
6057 f->dump_stream("until") << p->second;
6058 f->close_section();
6059 } else {
6060 stringstream ss;
6061 string s;
6062 ss << p->first << " " << p->second;
6063 getline(ss, s);
6064 s += "\n";
6065 rdata.append(s);
6066 }
6067 }
6068 if (f) {
6069 f->close_section();
6070 f->flush(rdata);
6071 }
// Summary line (outer 'ss') counts entries from both lists combined.
6072 ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
6073
// "osd pool ls [detail]": list pool names, optionally with full pool
// details.  Plain-text "detail" delegates to OSDMap::print_pools();
// the formatter "detail" path additionally dumps the read-balance score
// per pool.
6074 } else if (prefix == "osd pool ls") {
6075 string detail;
6076 cmd_getval(cmdmap, "detail", detail);
6077 if (!f && detail == "detail") {
6078 ostringstream ss;
6079 osdmap.print_pools(cct, ss);
6080 rdata.append(ss.str());
6081 } else {
6082 if (f)
6083 f->open_array_section("pools");
6084 for (auto &[pid, pdata] : osdmap.get_pools()) {
6085 if (f) {
6086 if (detail == "detail") {
6087 f->open_object_section("pool");
6088 f->dump_int("pool_id", pid);
6089 f->dump_string("pool_name", osdmap.get_pool_name(pid));
6090 pdata.dump(f.get());
6091 osdmap.dump_read_balance_score(cct, pid, pdata, f.get());
6092 f->close_section();
6093 } else {
6094 f->dump_string("pool_name", osdmap.get_pool_name(pid));
6095 }
6096 } else {
// No formatter, no detail: one pool name per line.
6097 rdata.append(osdmap.get_pool_name(pid) + "\n");
6098 }
6099 }
6100 if (f) {
6101 f->close_section();
6102 f->flush(rdata);
6103 }
6104 }
6105
// "osd crush get-tunable <tunable>": read a crush tunable.  Only
// "straw_calc_version" is currently supported; anything else returns
// -EINVAL.
6106 } else if (prefix == "osd crush get-tunable") {
6107 string tunable;
6108 cmd_getval(cmdmap, "tunable", tunable);
6109 ostringstream rss;
6110 if (f)
6111 f->open_object_section("tunable");
6112 if (tunable == "straw_calc_version") {
6113 if (f)
6114 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6115 else
6116 rss << osdmap.crush->get_straw_calc_version() << "\n";
6117 } else {
// NOTE: the "tunable" section opened above is abandoned here, but the
// formatter is never flushed on this path, so no partial output leaks.
6118 r = -EINVAL;
6119 goto reply;
6120 }
6121 if (f) {
6122 f->close_section();
6123 f->flush(rdata);
6124 } else {
6125 rdata.append(rss.str());
6126 }
6127 r = 0;
6128
6129 } else if (prefix == "osd pool get") {
6130 string poolstr;
6131 cmd_getval(cmdmap, "pool", poolstr);
6132 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6133 if (pool < 0) {
6134 ss << "unrecognized pool '" << poolstr << "'";
6135 r = -ENOENT;
6136 goto reply;
6137 }
6138
6139 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6140 string var;
6141 cmd_getval(cmdmap, "var", var);
6142
6143 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6144 const choices_map_t ALL_CHOICES = {
6145 {"size", SIZE},
6146 {"min_size", MIN_SIZE},
6147 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6148 {"crush_rule", CRUSH_RULE},
6149 {"hashpspool", HASHPSPOOL},
6150 {"eio", POOL_EIO},
6151 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6152 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6153 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6154 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6155 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6156 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6157 {"use_gmt_hitset", USE_GMT_HITSET},
6158 {"target_max_objects", TARGET_MAX_OBJECTS},
6159 {"target_max_bytes", TARGET_MAX_BYTES},
6160 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6161 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6162 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6163 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6164 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6165 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6166 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6167 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6168 {"fast_read", FAST_READ},
6169 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6170 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6171 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6172 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6173 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6174 {"recovery_priority", RECOVERY_PRIORITY},
6175 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6176 {"scrub_priority", SCRUB_PRIORITY},
6177 {"compression_mode", COMPRESSION_MODE},
6178 {"compression_algorithm", COMPRESSION_ALGORITHM},
6179 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6180 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6181 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6182 {"csum_type", CSUM_TYPE},
6183 {"csum_max_block", CSUM_MAX_BLOCK},
6184 {"csum_min_block", CSUM_MIN_BLOCK},
6185 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6186 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6187 {"pg_num_min", PG_NUM_MIN},
6188 {"pg_num_max", PG_NUM_MAX},
6189 {"target_size_bytes", TARGET_SIZE_BYTES},
6190 {"target_size_ratio", TARGET_SIZE_RATIO},
6191 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6192 {"dedup_tier", DEDUP_TIER},
6193 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6194 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6195 {"bulk", BULK}
6196 };
6197
6198 typedef std::set<osd_pool_get_choices> choices_set_t;
6199
6200 const choices_set_t ONLY_TIER_CHOICES = {
6201 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6202 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6203 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6204 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6205 MIN_READ_RECENCY_FOR_PROMOTE,
6206 MIN_WRITE_RECENCY_FOR_PROMOTE,
6207 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6208 };
6209 const choices_set_t ONLY_ERASURE_CHOICES = {
6210 EC_OVERWRITES, ERASURE_CODE_PROFILE
6211 };
6212
6213 choices_set_t selected_choices;
6214 if (var == "all") {
6215 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6216 it != ALL_CHOICES.end(); ++it) {
6217 selected_choices.insert(it->second);
6218 }
6219
6220 if(!p->is_tier()) {
6221 selected_choices = subtract_second_from_first(selected_choices,
6222 ONLY_TIER_CHOICES);
6223 }
6224
6225 if(!p->is_erasure()) {
6226 selected_choices = subtract_second_from_first(selected_choices,
6227 ONLY_ERASURE_CHOICES);
6228 }
6229 } else /* var != "all" */ {
6230 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6231 if (found == ALL_CHOICES.end()) {
6232 ss << "pool '" << poolstr
6233 << "': invalid variable: '" << var << "'";
6234 r = -EINVAL;
6235 goto reply;
6236 }
6237
6238 osd_pool_get_choices selected = found->second;
6239
6240 if (!p->is_tier() &&
6241 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6242 ss << "pool '" << poolstr
6243 << "' is not a tier pool: variable not applicable";
6244 r = -EACCES;
6245 goto reply;
6246 }
6247
6248 if (!p->is_erasure() &&
6249 ONLY_ERASURE_CHOICES.find(selected)
6250 != ONLY_ERASURE_CHOICES.end()) {
6251 ss << "pool '" << poolstr
6252 << "' is not a erasure pool: variable not applicable";
6253 r = -EACCES;
6254 goto reply;
6255 }
6256
6257 if (pool_opts_t::is_opt_name(var) &&
6258 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6259 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6260 r = -ENOENT;
6261 goto reply;
6262 }
6263
6264 selected_choices.insert(selected);
6265 }
6266
6267 if (f) {
6268 f->open_object_section("pool");
6269 f->dump_string("pool", poolstr);
6270 f->dump_int("pool_id", pool);
6271 for(choices_set_t::const_iterator it = selected_choices.begin();
6272 it != selected_choices.end(); ++it) {
6273 choices_map_t::const_iterator i;
6274 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6275 if (i->second == *it) {
6276 break;
6277 }
6278 }
6279 ceph_assert(i != ALL_CHOICES.end());
6280 switch(*it) {
6281 case PG_NUM:
6282 f->dump_int("pg_num", p->get_pg_num());
6283 break;
6284 case PGP_NUM:
6285 f->dump_int("pgp_num", p->get_pgp_num());
6286 break;
6287 case SIZE:
6288 f->dump_int("size", p->get_size());
6289 break;
6290 case MIN_SIZE:
6291 f->dump_int("min_size", p->get_min_size());
6292 break;
6293 case CRUSH_RULE:
6294 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6295 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6296 p->get_crush_rule()));
6297 } else {
6298 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6299 }
6300 break;
6301 case EC_OVERWRITES:
6302 f->dump_bool("allow_ec_overwrites",
6303 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6304 break;
6305 case PG_AUTOSCALE_MODE:
6306 f->dump_string("pg_autoscale_mode",
6307 pg_pool_t::get_pg_autoscale_mode_name(
6308 p->pg_autoscale_mode));
6309 break;
6310 case HASHPSPOOL:
6311 case POOL_EIO:
6312 case NODELETE:
6313 case BULK:
6314 case NOPGCHANGE:
6315 case NOSIZECHANGE:
6316 case WRITE_FADVISE_DONTNEED:
6317 case NOSCRUB:
6318 case NODEEP_SCRUB:
6319 f->dump_bool(i->first.c_str(),
6320 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6321 break;
6322 case HIT_SET_PERIOD:
6323 f->dump_int("hit_set_period", p->hit_set_period);
6324 break;
6325 case HIT_SET_COUNT:
6326 f->dump_int("hit_set_count", p->hit_set_count);
6327 break;
6328 case HIT_SET_TYPE:
6329 f->dump_string("hit_set_type",
6330 HitSet::get_type_name(p->hit_set_params.get_type()));
6331 break;
6332 case HIT_SET_FPP:
6333 {
6334 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6335 BloomHitSet::Params *bloomp =
6336 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6337 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6338 } else if(var != "all") {
6339 f->close_section();
6340 ss << "hit set is not of type Bloom; " <<
6341 "invalid to get a false positive rate!";
6342 r = -EINVAL;
6343 goto reply;
6344 }
6345 }
6346 break;
6347 case USE_GMT_HITSET:
6348 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6349 break;
6350 case TARGET_MAX_OBJECTS:
6351 f->dump_unsigned("target_max_objects", p->target_max_objects);
6352 break;
6353 case TARGET_MAX_BYTES:
6354 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6355 break;
6356 case CACHE_TARGET_DIRTY_RATIO:
6357 f->dump_unsigned("cache_target_dirty_ratio_micro",
6358 p->cache_target_dirty_ratio_micro);
6359 f->dump_float("cache_target_dirty_ratio",
6360 ((float)p->cache_target_dirty_ratio_micro/1000000));
6361 break;
6362 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6363 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6364 p->cache_target_dirty_high_ratio_micro);
6365 f->dump_float("cache_target_dirty_high_ratio",
6366 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6367 break;
6368 case CACHE_TARGET_FULL_RATIO:
6369 f->dump_unsigned("cache_target_full_ratio_micro",
6370 p->cache_target_full_ratio_micro);
6371 f->dump_float("cache_target_full_ratio",
6372 ((float)p->cache_target_full_ratio_micro/1000000));
6373 break;
6374 case CACHE_MIN_FLUSH_AGE:
6375 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6376 break;
6377 case CACHE_MIN_EVICT_AGE:
6378 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6379 break;
6380 case ERASURE_CODE_PROFILE:
6381 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6382 break;
6383 case MIN_READ_RECENCY_FOR_PROMOTE:
6384 f->dump_int("min_read_recency_for_promote",
6385 p->min_read_recency_for_promote);
6386 break;
6387 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6388 f->dump_int("min_write_recency_for_promote",
6389 p->min_write_recency_for_promote);
6390 break;
6391 case FAST_READ:
6392 f->dump_int("fast_read", p->fast_read);
6393 break;
6394 case HIT_SET_GRADE_DECAY_RATE:
6395 f->dump_int("hit_set_grade_decay_rate",
6396 p->hit_set_grade_decay_rate);
6397 break;
6398 case HIT_SET_SEARCH_LAST_N:
6399 f->dump_int("hit_set_search_last_n",
6400 p->hit_set_search_last_n);
6401 break;
6402 case SCRUB_MIN_INTERVAL:
6403 case SCRUB_MAX_INTERVAL:
6404 case DEEP_SCRUB_INTERVAL:
6405 case RECOVERY_PRIORITY:
6406 case RECOVERY_OP_PRIORITY:
6407 case SCRUB_PRIORITY:
6408 case COMPRESSION_MODE:
6409 case COMPRESSION_ALGORITHM:
6410 case COMPRESSION_REQUIRED_RATIO:
6411 case COMPRESSION_MAX_BLOB_SIZE:
6412 case COMPRESSION_MIN_BLOB_SIZE:
6413 case CSUM_TYPE:
6414 case CSUM_MAX_BLOCK:
6415 case CSUM_MIN_BLOCK:
6416 case FINGERPRINT_ALGORITHM:
6417 case PG_NUM_MIN:
6418 case PG_NUM_MAX:
6419 case TARGET_SIZE_BYTES:
6420 case TARGET_SIZE_RATIO:
6421 case PG_AUTOSCALE_BIAS:
6422 case DEDUP_TIER:
6423 case DEDUP_CHUNK_ALGORITHM:
6424 case DEDUP_CDC_CHUNK_SIZE:
6425 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6426 if (p->opts.is_set(key)) {
6427 if(*it == CSUM_TYPE) {
6428 int64_t val;
6429 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6430 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6431 } else {
6432 p->opts.dump(i->first, f.get());
6433 }
6434 }
6435 break;
6436 }
6437 }
6438 f->close_section();
6439 f->flush(rdata);
6440 } else /* !f */ {
6441 for(choices_set_t::const_iterator it = selected_choices.begin();
6442 it != selected_choices.end(); ++it) {
6443 choices_map_t::const_iterator i;
6444 switch(*it) {
6445 case PG_NUM:
6446 ss << "pg_num: " << p->get_pg_num() << "\n";
6447 break;
6448 case PGP_NUM:
6449 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6450 break;
6451 case SIZE:
6452 ss << "size: " << p->get_size() << "\n";
6453 break;
6454 case MIN_SIZE:
6455 ss << "min_size: " << p->get_min_size() << "\n";
6456 break;
6457 case CRUSH_RULE:
6458 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6459 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6460 p->get_crush_rule()) << "\n";
6461 } else {
6462 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6463 }
6464 break;
6465 case PG_AUTOSCALE_MODE:
6466 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6467 p->pg_autoscale_mode) <<"\n";
6468 break;
6469 case HIT_SET_PERIOD:
6470 ss << "hit_set_period: " << p->hit_set_period << "\n";
6471 break;
6472 case HIT_SET_COUNT:
6473 ss << "hit_set_count: " << p->hit_set_count << "\n";
6474 break;
6475 case HIT_SET_TYPE:
6476 ss << "hit_set_type: " <<
6477 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6478 break;
6479 case HIT_SET_FPP:
6480 {
6481 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6482 BloomHitSet::Params *bloomp =
6483 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6484 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6485 } else if(var != "all") {
6486 ss << "hit set is not of type Bloom; " <<
6487 "invalid to get a false positive rate!";
6488 r = -EINVAL;
6489 goto reply;
6490 }
6491 }
6492 break;
6493 case USE_GMT_HITSET:
6494 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6495 break;
6496 case TARGET_MAX_OBJECTS:
6497 ss << "target_max_objects: " << p->target_max_objects << "\n";
6498 break;
6499 case TARGET_MAX_BYTES:
6500 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6501 break;
6502 case CACHE_TARGET_DIRTY_RATIO:
6503 ss << "cache_target_dirty_ratio: "
6504 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6505 break;
6506 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6507 ss << "cache_target_dirty_high_ratio: "
6508 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6509 break;
6510 case CACHE_TARGET_FULL_RATIO:
6511 ss << "cache_target_full_ratio: "
6512 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6513 break;
6514 case CACHE_MIN_FLUSH_AGE:
6515 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6516 break;
6517 case CACHE_MIN_EVICT_AGE:
6518 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6519 break;
6520 case ERASURE_CODE_PROFILE:
6521 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6522 break;
6523 case MIN_READ_RECENCY_FOR_PROMOTE:
6524 ss << "min_read_recency_for_promote: " <<
6525 p->min_read_recency_for_promote << "\n";
6526 break;
6527 case HIT_SET_GRADE_DECAY_RATE:
6528 ss << "hit_set_grade_decay_rate: " <<
6529 p->hit_set_grade_decay_rate << "\n";
6530 break;
6531 case HIT_SET_SEARCH_LAST_N:
6532 ss << "hit_set_search_last_n: " <<
6533 p->hit_set_search_last_n << "\n";
6534 break;
6535 case EC_OVERWRITES:
6536 ss << "allow_ec_overwrites: " <<
6537 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6538 "\n";
6539 break;
6540 case HASHPSPOOL:
6541 case POOL_EIO:
6542 case NODELETE:
6543 case BULK:
6544 case NOPGCHANGE:
6545 case NOSIZECHANGE:
6546 case WRITE_FADVISE_DONTNEED:
6547 case NOSCRUB:
6548 case NODEEP_SCRUB:
6549 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6550 if (i->second == *it)
6551 break;
6552 }
6553 ceph_assert(i != ALL_CHOICES.end());
6554 ss << i->first << ": " <<
6555 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6556 "true" : "false") << "\n";
6557 break;
6558 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6559 ss << "min_write_recency_for_promote: " <<
6560 p->min_write_recency_for_promote << "\n";
6561 break;
6562 case FAST_READ:
6563 ss << "fast_read: " << p->fast_read << "\n";
6564 break;
6565 case SCRUB_MIN_INTERVAL:
6566 case SCRUB_MAX_INTERVAL:
6567 case DEEP_SCRUB_INTERVAL:
6568 case RECOVERY_PRIORITY:
6569 case RECOVERY_OP_PRIORITY:
6570 case SCRUB_PRIORITY:
6571 case COMPRESSION_MODE:
6572 case COMPRESSION_ALGORITHM:
6573 case COMPRESSION_REQUIRED_RATIO:
6574 case COMPRESSION_MAX_BLOB_SIZE:
6575 case COMPRESSION_MIN_BLOB_SIZE:
6576 case CSUM_TYPE:
6577 case CSUM_MAX_BLOCK:
6578 case CSUM_MIN_BLOCK:
6579 case FINGERPRINT_ALGORITHM:
6580 case PG_NUM_MIN:
6581 case PG_NUM_MAX:
6582 case TARGET_SIZE_BYTES:
6583 case TARGET_SIZE_RATIO:
6584 case PG_AUTOSCALE_BIAS:
6585 case DEDUP_TIER:
6586 case DEDUP_CHUNK_ALGORITHM:
6587 case DEDUP_CDC_CHUNK_SIZE:
6588 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6589 if (i->second == *it)
6590 break;
6591 }
6592 ceph_assert(i != ALL_CHOICES.end());
6593 {
6594 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6595 if (p->opts.is_set(key)) {
6596 if(key == pool_opts_t::CSUM_TYPE) {
6597 int64_t val;
6598 p->opts.get(key, &val);
6599 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6600 } else {
6601 ss << i->first << ": " << p->opts.get(key) << "\n";
6602 }
6603 }
6604 }
6605 break;
6606 }
6607 rdata.append(ss.str());
6608 ss.str("");
6609 }
6610 }
6611 r = 0;
// "osd pool get-quota <pool>": report the pool's object/byte quotas
// together with current usage pulled from the mgr's pool stats.  A quota
// value of 0 means "no quota" and is rendered as N/A.
6612 } else if (prefix == "osd pool get-quota") {
6613 string pool_name;
6614 cmd_getval(cmdmap, "pool", pool_name);
6615
6616 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6617 if (poolid < 0) {
6618 ceph_assert(poolid == -ENOENT);
6619 ss << "unrecognized pool '" << pool_name << "'";
6620 r = -ENOENT;
6621 goto reply;
6622 }
6623 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
// Usage comes from the mgr stat monitor; without stats we cannot report.
6624 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6625 if (!pstat) {
6626 ss << "no stats for pool '" << pool_name << "'";
6627 r = -ENOENT;
6628 goto reply;
6629 }
6630 const object_stat_sum_t& sum = pstat->stats.sum;
6631 if (f) {
6632 f->open_object_section("pool_quotas");
6633 f->dump_string("pool_name", pool_name);
6634 f->dump_unsigned("pool_id", poolid);
6635 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6636 f->dump_int("current_num_objects", sum.num_objects);
6637 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6638 f->dump_int("current_num_bytes", sum.num_bytes);
6639 f->close_section();
6640 f->flush(rdata);
6641 } else {
6642 stringstream rs;
6643 rs << "quotas for pool '" << pool_name << "':\n"
6644 << "  max objects: ";
6645 if (p->quota_max_objects == 0)
6646 rs << "N/A";
6647 else {
6648 rs << si_u_t(p->quota_max_objects) << " objects";
6649 rs << " (current num objects: " << sum.num_objects << " objects)";
6650 }
6651 rs << "\n"
6652 << "  max bytes  : ";
6653 if (p->quota_max_bytes == 0)
6654 rs << "N/A";
6655 else {
6656 rs << byte_u_t(p->quota_max_bytes);
6657 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6658 }
6659 rdata.append(rs.str());
6660 }
6661 rdata.append("\n");
6662 r = 0;
6663 } else if (prefix == "osd crush rule list" ||
6664 prefix == "osd crush rule ls") {
6665 if (f) {
6666 f->open_array_section("rules");
6667 osdmap.crush->list_rules(f.get());
6668 f->close_section();
6669 f->flush(rdata);
6670 } else {
6671 ostringstream ss;
6672 osdmap.crush->list_rules(&ss);
6673 rdata.append(ss.str());
6674 }
6675 } else if (prefix == "osd crush rule ls-by-class") {
6676 string class_name;
6677 cmd_getval(cmdmap, "class", class_name);
6678 if (class_name.empty()) {
6679 ss << "no class specified";
6680 r = -EINVAL;
6681 goto reply;
6682 }
6683 set<int> rules;
6684 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6685 if (r < 0) {
6686 ss << "failed to get rules by class '" << class_name << "'";
6687 goto reply;
6688 }
6689 if (f) {
6690 f->open_array_section("rules");
6691 for (auto &rule: rules) {
6692 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6693 }
6694 f->close_section();
6695 f->flush(rdata);
6696 } else {
6697 ostringstream rs;
6698 for (auto &rule: rules) {
6699 rs << osdmap.crush->get_rule_name(rule) << "\n";
6700 }
6701 rdata.append(rs.str());
6702 }
6703 } else if (prefix == "osd crush rule dump") {
6704 string name;
6705 cmd_getval(cmdmap, "name", name);
6706 string format;
6707 cmd_getval(cmdmap, "format", format);
6708 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6709 if (name == "") {
6710 f->open_array_section("rules");
6711 osdmap.crush->dump_rules(f.get());
6712 f->close_section();
6713 } else {
6714 int ruleno = osdmap.crush->get_rule_id(name);
6715 if (ruleno < 0) {
6716 ss << "unknown crush rule '" << name << "'";
6717 r = ruleno;
6718 goto reply;
6719 }
6720 osdmap.crush->dump_rule(ruleno, f.get());
6721 }
6722 ostringstream rs;
6723 f->flush(rs);
6724 rs << "\n";
6725 rdata.append(rs.str());
6726 } else if (prefix == "osd crush dump") {
6727 string format;
6728 cmd_getval(cmdmap, "format", format);
6729 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6730 f->open_object_section("crush_map");
6731 osdmap.crush->dump(f.get());
6732 f->close_section();
6733 ostringstream rs;
6734 f->flush(rs);
6735 rs << "\n";
6736 rdata.append(rs.str());
6737 } else if (prefix == "osd crush show-tunables") {
6738 string format;
6739 cmd_getval(cmdmap, "format", format);
6740 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6741 f->open_object_section("crush_map_tunables");
6742 osdmap.crush->dump_tunables(f.get());
6743 f->close_section();
6744 ostringstream rs;
6745 f->flush(rs);
6746 rs << "\n";
6747 rdata.append(rs.str());
6748 } else if (prefix == "osd crush tree") {
6749 bool show_shadow = false;
6750 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6751 std::string shadow;
6752 if (cmd_getval(cmdmap, "shadow", shadow) &&
6753 shadow == "--show-shadow") {
6754 show_shadow = true;
6755 }
6756 }
6757 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6758 if (f) {
6759 f->open_object_section("crush_tree");
6760 osdmap.crush->dump_tree(nullptr,
6761 f.get(),
6762 osdmap.get_pool_names(),
6763 show_shadow);
6764 f->close_section();
6765 f->flush(rdata);
6766 } else {
6767 ostringstream ss;
6768 osdmap.crush->dump_tree(&ss,
6769 nullptr,
6770 osdmap.get_pool_names(),
6771 show_shadow);
6772 rdata.append(ss.str());
6773 }
6774 } else if (prefix == "osd crush ls") {
6775 string name;
6776 if (!cmd_getval(cmdmap, "node", name)) {
6777 ss << "no node specified";
6778 r = -EINVAL;
6779 goto reply;
6780 }
6781 if (!osdmap.crush->name_exists(name)) {
6782 ss << "node '" << name << "' does not exist";
6783 r = -ENOENT;
6784 goto reply;
6785 }
6786 int id = osdmap.crush->get_item_id(name);
6787 list<int> result;
6788 if (id >= 0) {
6789 result.push_back(id);
6790 } else {
6791 int num = osdmap.crush->get_bucket_size(id);
6792 for (int i = 0; i < num; ++i) {
6793 result.push_back(osdmap.crush->get_bucket_item(id, i));
6794 }
6795 }
6796 if (f) {
6797 f->open_array_section("items");
6798 for (auto i : result) {
6799 f->dump_string("item", osdmap.crush->get_item_name(i));
6800 }
6801 f->close_section();
6802 f->flush(rdata);
6803 } else {
6804 ostringstream ss;
6805 for (auto i : result) {
6806 ss << osdmap.crush->get_item_name(i) << "\n";
6807 }
6808 rdata.append(ss.str());
6809 }
6810 r = 0;
6811 } else if (prefix == "osd crush class ls") {
6812 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6813 f->open_array_section("crush_classes");
6814 for (auto i : osdmap.crush->class_name)
6815 f->dump_string("class", i.second);
6816 f->close_section();
6817 f->flush(rdata);
6818 } else if (prefix == "osd crush class ls-osd") {
6819 string name;
6820 cmd_getval(cmdmap, "class", name);
6821 set<int> osds;
6822 osdmap.crush->get_devices_by_class(name, &osds);
6823 if (f) {
6824 f->open_array_section("osds");
6825 for (auto &osd: osds)
6826 f->dump_int("osd", osd);
6827 f->close_section();
6828 f->flush(rdata);
6829 } else {
6830 bool first = true;
6831 for (auto &osd : osds) {
6832 if (!first)
6833 ds << "\n";
6834 first = false;
6835 ds << osd;
6836 }
6837 rdata.append(ds);
6838 }
6839 } else if (prefix == "osd crush get-device-class") {
6840 vector<string> idvec;
6841 cmd_getval(cmdmap, "ids", idvec);
6842 map<int, string> class_by_osd;
6843 for (auto& id : idvec) {
6844 ostringstream ts;
6845 long osd = parse_osd_id(id.c_str(), &ts);
6846 if (osd < 0) {
6847 ss << "unable to parse osd id:'" << id << "'";
6848 r = -EINVAL;
6849 goto reply;
6850 }
6851 auto device_class = osdmap.crush->get_item_class(osd);
6852 if (device_class)
6853 class_by_osd[osd] = device_class;
6854 else
6855 class_by_osd[osd] = ""; // no class
6856 }
6857 if (f) {
6858 f->open_array_section("osd_device_classes");
6859 for (auto& i : class_by_osd) {
6860 f->open_object_section("osd_device_class");
6861 f->dump_int("osd", i.first);
6862 f->dump_string("device_class", i.second);
6863 f->close_section();
6864 }
6865 f->close_section();
6866 f->flush(rdata);
6867 } else {
6868 if (class_by_osd.size() == 1) {
6869 // for single input, make a clean output
6870 ds << class_by_osd.begin()->second;
6871 } else {
6872 // note that we do not group osds by class here
6873 for (auto it = class_by_osd.begin();
6874 it != class_by_osd.end();
6875 it++) {
6876 ds << "osd." << it->first << ' ' << it->second;
6877 if (next(it) != class_by_osd.end())
6878 ds << '\n';
6879 }
6880 }
6881 rdata.append(ds);
6882 }
6883 } else if (prefix == "osd erasure-code-profile ls") {
6884 const auto &profiles = osdmap.get_erasure_code_profiles();
6885 if (f)
6886 f->open_array_section("erasure-code-profiles");
6887 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6888 if (f)
6889 f->dump_string("profile", i->first.c_str());
6890 else
6891 rdata.append(i->first + "\n");
6892 }
6893 if (f) {
6894 f->close_section();
6895 ostringstream rs;
6896 f->flush(rs);
6897 rs << "\n";
6898 rdata.append(rs.str());
6899 }
6900 } else if (prefix == "osd crush weight-set ls") {
6901 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6902 if (f) {
6903 f->open_array_section("weight_sets");
6904 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6905 f->dump_string("pool", "(compat)");
6906 }
6907 for (auto& i : osdmap.crush->choose_args) {
6908 if (i.first >= 0) {
6909 f->dump_string("pool", osdmap.get_pool_name(i.first));
6910 }
6911 }
6912 f->close_section();
6913 f->flush(rdata);
6914 } else {
6915 ostringstream rs;
6916 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6917 rs << "(compat)\n";
6918 }
6919 for (auto& i : osdmap.crush->choose_args) {
6920 if (i.first >= 0) {
6921 rs << osdmap.get_pool_name(i.first) << "\n";
6922 }
6923 }
6924 rdata.append(rs.str());
6925 }
6926 } else if (prefix == "osd crush weight-set dump") {
6927 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6928 "json-pretty"));
6929 osdmap.crush->dump_choose_args(f.get());
6930 f->flush(rdata);
6931 } else if (prefix == "osd erasure-code-profile get") {
6932 string name;
6933 cmd_getval(cmdmap, "name", name);
6934 if (!osdmap.has_erasure_code_profile(name)) {
6935 ss << "unknown erasure code profile '" << name << "'";
6936 r = -ENOENT;
6937 goto reply;
6938 }
6939 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6940 if (f)
6941 f->open_object_section("profile");
6942 for (map<string,string>::const_iterator i = profile.begin();
6943 i != profile.end();
6944 ++i) {
6945 if (f)
6946 f->dump_string(i->first.c_str(), i->second.c_str());
6947 else
6948 rdata.append(i->first + "=" + i->second + "\n");
6949 }
6950 if (f) {
6951 f->close_section();
6952 ostringstream rs;
6953 f->flush(rs);
6954 rs << "\n";
6955 rdata.append(rs.str());
6956 }
6957 } else if (prefix == "osd pool application get") {
6958 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6959 "json-pretty"));
6960 string pool_name;
6961 cmd_getval(cmdmap, "pool", pool_name);
6962 string app;
6963 cmd_getval(cmdmap, "app", app);
6964 string key;
6965 cmd_getval(cmdmap, "key", key);
6966
6967 if (pool_name.empty()) {
6968 // all
6969 f->open_object_section("pools");
6970 for (const auto &pool : osdmap.pools) {
6971 std::string name("<unknown>");
6972 const auto &pni = osdmap.pool_name.find(pool.first);
6973 if (pni != osdmap.pool_name.end())
6974 name = pni->second;
6975 f->open_object_section(name.c_str());
6976 for (auto &app_pair : pool.second.application_metadata) {
6977 f->open_object_section(app_pair.first.c_str());
6978 for (auto &kv_pair : app_pair.second) {
6979 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6980 }
6981 f->close_section();
6982 }
6983 f->close_section(); // name
6984 }
6985 f->close_section(); // pools
6986 f->flush(rdata);
6987 } else {
6988 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6989 if (pool < 0) {
6990 ss << "unrecognized pool '" << pool_name << "'";
6991 r = -ENOENT;
6992 goto reply;
6993 }
6994 auto p = osdmap.get_pg_pool(pool);
6995 // filter by pool
6996 if (app.empty()) {
6997 f->open_object_section(pool_name.c_str());
6998 for (auto &app_pair : p->application_metadata) {
6999 f->open_object_section(app_pair.first.c_str());
7000 for (auto &kv_pair : app_pair.second) {
7001 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7002 }
7003 f->close_section(); // application
7004 }
7005 f->close_section(); // pool_name
7006 f->flush(rdata);
7007 goto reply;
7008 }
7009
7010 auto app_it = p->application_metadata.find(app);
7011 if (app_it == p->application_metadata.end()) {
7012 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7013 r = -ENOENT;
7014 goto reply;
7015 }
7016 // filter by pool + app
7017 if (key.empty()) {
7018 f->open_object_section(app_it->first.c_str());
7019 for (auto &kv_pair : app_it->second) {
7020 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7021 }
7022 f->close_section(); // application
7023 f->flush(rdata);
7024 goto reply;
7025 }
7026 // filter by pool + app + key
7027 auto key_it = app_it->second.find(key);
7028 if (key_it == app_it->second.end()) {
7029 ss << "application '" << app << "' on pool '" << pool_name
7030 << "' does not have key '" << key << "'";
7031 r = -ENOENT;
7032 goto reply;
7033 }
7034 ss << key_it->second << "\n";
7035 rdata.append(ss.str());
7036 ss.str("");
7037 }
7038 } else if (prefix == "osd get-require-min-compat-client") {
7039 ss << osdmap.require_min_compat_client << std::endl;
7040 rdata.append(ss.str());
7041 ss.str("");
7042 goto reply;
7043 } else if (prefix == "osd pool application enable" ||
7044 prefix == "osd pool application disable" ||
7045 prefix == "osd pool application set" ||
7046 prefix == "osd pool application rm") {
7047 bool changed = false;
7048 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7049 if (r != 0) {
7050 // Error, reply.
7051 goto reply;
7052 } else if (changed) {
7053 // Valid mutation, proceed to prepare phase
7054 return false;
7055 } else {
7056 // Idempotent case, reply
7057 goto reply;
7058 }
7059 } else {
7060 // try prepare update
7061 return false;
7062 }
7063
7064 reply:
7065 string rs;
7066 getline(ss, rs);
7067 mon.reply_command(op, r, rs, rdata, get_last_committed());
7068 return true;
7069 }
7070
7071 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7072 {
7073 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7074 osdmap.get_pg_pool(pool_id));
7075 ceph_assert(pool);
7076 pool->set_flag(flags);
7077 }
7078
7079 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7080 {
7081 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7082 osdmap.get_pg_pool(pool_id));
7083 ceph_assert(pool);
7084 pool->unset_flag(flags);
7085 }
7086
7087 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7088 {
7089 char k[80];
7090 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7091 return k;
7092 }
7093
7094 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7095 {
7096 char k[80];
7097 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7098 (unsigned long long)pool, (unsigned long long)snap);
7099 return k;
7100 }
7101
// Build the key and value for a purged-snap record covering the
// half-open interval [snap, snap+num) of 'pool', purged as of 'epoch'.
// The value encodes (begin, end, epoch); the returned key embeds the
// *last* snap of the interval so that a forward lower_bound() scan from
// any snap inside the interval lands on this record -- see
// lookup_purged_snap().
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7113
7114
// Look up the purged-snap record whose interval contains (pool, snap).
//
// Records are keyed by the *last* snap of each interval (see
// make_purged_snap_key_value), so a lower_bound seek on the key for
// 'snap' lands on the only record that could contain it.
//
// Returns 0 and fills *begin/*end with the half-open interval
// [begin, end) containing 'snap'; returns -ENOENT if no record matches.
// NOTE: on the final (no-overlap) failure path *begin/*end have already
// been filled with the decoded neighboring interval -- callers such as
// try_prune_purged_snaps() rely on inspecting those values even when
// -ENOENT is returned.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the prefix: nothing at or after our key
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on a different record type within OSD_SNAP_PREFIX
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // crossed into the next pool's records
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value encodes (begin, end, epoch); we only need the interval here
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // record exists but its interval does not cover 'snap'
    // (*begin/*end are left populated for the caller -- see note above)
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7164
// Record that snaps [start, end) of 'pool' have been purged, coalescing
// the new interval with any adjacent/overlapping records already in the
// store.  We probe the neighbors via lookup_purged_snap(): 'b' tests
// whether start-1 is covered (an earlier record we should extend), 'a'
// tests whether 'end' is covered (a later record we should absorb).
// Because records are keyed by their *last* snap, extending an interval
// to the right changes its key, so the old record must be erased first.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right; its key changes, so
    // erase the old record and write the widened one
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left; the key (last snap) is
    // unchanged, so simply overwrite it in place
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no neighbors: write a brand new record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7220
// Move snaps that the OSDs (via the mgr digest) report as purged into
// pending_inc.new_purged_snaps, up to mon_max_snap_prune_per_epoch per
// epoch.  Returns true if anything was queued for pruning (i.e. the
// pending incremental was mutated), false otherwise.
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon.mgrstatmon()->is_readable()) {
    // mgr stat digest not available yet; try again later
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "no limit"; substitute a large cap
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      // NOTE: lookup_purged_snap fills pbegin/pend even on -ENOENT
      // (the nearest record's interval), which we use to clip our range
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7299
// Re-evaluate per-pool quota state against current mgr pool statistics,
// setting or clearing FLAG_FULL_QUOTA (and related full flags) in the
// pending incremental.  Returns true if any pool flags were changed.
bool OSDMonitor::update_pools_status()
{
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is over quota when either its byte or object quota is
    // configured (>0) and met or exceeded
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently marked over-quota; clear only once it drops below quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently marked; set flags if quota newly exceeded
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7360
7361 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7362 {
7363 op->mark_osdmon_event(__func__);
7364 auto m = op->get_req<MPoolOp>();
7365 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7366 MonSession *session = op->get_session();
7367 if (!session)
7368 return -EPERM;
7369 string erasure_code_profile;
7370 stringstream ss;
7371 string rule_name;
7372 bool bulk = false;
7373 int ret = 0;
7374 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7375 0, 0, 0, 0, 0, 0, 0.0,
7376 erasure_code_profile,
7377 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7378 cct->_conf.get_val<bool>("osd_pool_default_crimson"),
7379 &ss);
7380
7381 if (ret < 0) {
7382 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7383 }
7384 return ret;
7385 }
7386
7387 int OSDMonitor::crush_rename_bucket(const string& srcname,
7388 const string& dstname,
7389 ostream *ss)
7390 {
7391 int ret;
7392 //
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7395 //
7396 if (!_have_pending_crush()) {
7397 ret = _get_stable_crush().can_rename_bucket(srcname,
7398 dstname,
7399 ss);
7400 if (ret)
7401 return ret;
7402 }
7403
7404 CrushWrapper newcrush = _get_pending_crush();
7405
7406 ret = newcrush.rename_bucket(srcname,
7407 dstname,
7408 ss);
7409 if (ret)
7410 return ret;
7411
7412 pending_inc.crush.clear();
7413 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7414 *ss << "renamed bucket " << srcname << " into " << dstname;
7415 return 0;
7416 }
7417
7418 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7419 {
7420 string replacement = "";
7421
7422 if (plugin == "jerasure_generic" ||
7423 plugin == "jerasure_sse3" ||
7424 plugin == "jerasure_sse4" ||
7425 plugin == "jerasure_neon") {
7426 replacement = "jerasure";
7427 } else if (plugin == "shec_generic" ||
7428 plugin == "shec_sse3" ||
7429 plugin == "shec_sse4" ||
7430 plugin == "shec_neon") {
7431 replacement = "shec";
7432 }
7433
7434 if (replacement != "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7436 << plugin << " that has been deprecated. Please use "
7437 << replacement << " instead." << dendl;
7438 }
7439 }
7440
7441 int OSDMonitor::normalize_profile(const string& profilename,
7442 ErasureCodeProfile &profile,
7443 bool force,
7444 ostream *ss)
7445 {
7446 ErasureCodeInterfaceRef erasure_code;
7447 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7449 check_legacy_ec_plugin(plugin->second, profilename);
7450 int err = instance.factory(plugin->second,
7451 g_conf().get_val<std::string>("erasure_code_dir"),
7452 profile, &erasure_code, ss);
7453 if (err) {
7454 return err;
7455 }
7456
7457 err = erasure_code->init(profile, ss);
7458 if (err) {
7459 return err;
7460 }
7461
7462 auto it = profile.find("stripe_unit");
7463 if (it != profile.end()) {
7464 string err_str;
7465 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7466 if (!err_str.empty()) {
7467 *ss << "could not parse stripe_unit '" << it->second
7468 << "': " << err_str << std::endl;
7469 return -EINVAL;
7470 }
7471 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7472 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7473 if (chunk_size != stripe_unit) {
7474 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7476 << std::endl;
7477 return -EINVAL;
7478 }
7479 if ((stripe_unit % 4096) != 0 && !force) {
7480 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl;
7482 return -EINVAL;
7483 }
7484 }
7485 return 0;
7486 }
7487
7488 int OSDMonitor::crush_rule_create_erasure(const string &name,
7489 const string &profile,
7490 int *rule,
7491 ostream *ss)
7492 {
7493 int ruleid = osdmap.crush->get_rule_id(name);
7494 if (ruleid != -ENOENT) {
7495 *rule = ruleid;
7496 return -EEXIST;
7497 }
7498
7499 CrushWrapper newcrush = _get_pending_crush();
7500
7501 ruleid = newcrush.get_rule_id(name);
7502 if (ruleid != -ENOENT) {
7503 *rule = ruleid;
7504 return -EALREADY;
7505 } else {
7506 ErasureCodeInterfaceRef erasure_code;
7507 int err = get_erasure_code(profile, &erasure_code, ss);
7508 if (err) {
7509 *ss << "failed to load plugin using profile " << profile << std::endl;
7510 return err;
7511 }
7512
7513 err = erasure_code->create_rule(name, newcrush, ss);
7514 erasure_code.reset();
7515 if (err < 0)
7516 return err;
7517 *rule = err;
7518 pending_inc.crush.clear();
7519 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7520 return 0;
7521 }
7522 }
7523
7524 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7525 ErasureCodeInterfaceRef *erasure_code,
7526 ostream *ss) const
7527 {
7528 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7529 return -EAGAIN;
7530 ErasureCodeProfile profile =
7531 osdmap.get_erasure_code_profile(erasure_code_profile);
7532 ErasureCodeProfile::const_iterator plugin =
7533 profile.find("plugin");
7534 if (plugin == profile.end()) {
7535 *ss << "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile << std::endl;
7538 return -EINVAL;
7539 }
7540 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7541 auto& instance = ErasureCodePluginRegistry::instance();
7542 return instance.factory(plugin->second,
7543 g_conf().get_val<std::string>("erasure_code_dir"),
7544 profile, erasure_code, ss);
7545 }
7546
7547 int OSDMonitor::check_cluster_features(uint64_t features,
7548 stringstream &ss)
7549 {
7550 stringstream unsupported_ss;
7551 int unsupported_count = 0;
7552 if ((mon.get_quorum_con_features() & features) != features) {
7553 unsupported_ss << "the monitor cluster";
7554 ++unsupported_count;
7555 }
7556
7557 set<int32_t> up_osds;
7558 osdmap.get_up_osds(up_osds);
7559 for (set<int32_t>::iterator it = up_osds.begin();
7560 it != up_osds.end(); ++it) {
7561 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7562 if ((xi.features & features) != features) {
7563 if (unsupported_count > 0)
7564 unsupported_ss << ", ";
7565 unsupported_ss << "osd." << *it;
7566 unsupported_count ++;
7567 }
7568 }
7569
7570 if (unsupported_count > 0) {
7571 ss << "features " << features << " unsupported by: "
7572 << unsupported_ss.str();
7573 return -ENOTSUP;
7574 }
7575
7576 // check pending osd state, too!
7577 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7578 pending_inc.new_xinfo.begin();
7579 p != pending_inc.new_xinfo.end(); ++p) {
7580 const osd_xinfo_t &xi = p->second;
7581 if ((xi.features & features) != features) {
7582 dout(10) << __func__ << " pending osd." << p->first
7583 << " features are insufficient; retry" << dendl;
7584 return -EAGAIN;
7585 }
7586 }
7587
7588 return 0;
7589 }
7590
7591 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7592 stringstream& ss)
7593 {
7594 OSDMap::Incremental new_pending = pending_inc;
7595 encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
7596 OSDMap newmap;
7597 newmap.deepish_copy_from(osdmap);
7598 newmap.apply_incremental(new_pending);
7599
7600 // client compat
7601 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7602 auto mv = newmap.get_min_compat_client();
7603 if (mv > newmap.require_min_compat_client) {
7604 ss << "new crush map requires client version " << mv
7605 << " but require_min_compat_client is "
7606 << newmap.require_min_compat_client;
7607 return false;
7608 }
7609 }
7610
7611 // osd compat
7612 uint64_t features =
7613 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7614 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7615 stringstream features_ss;
7616 int r = check_cluster_features(features, features_ss);
7617 if (r) {
7618 ss << "Could not change CRUSH: " << features_ss.str();
7619 return false;
7620 }
7621
7622 return true;
7623 }
7624
7625 bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7627 const string &profile,
7628 ostream *ss)
7629 {
7630 bool found = false;
7631 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7632 p != pools.end();
7633 ++p) {
7634 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7635 *ss << osdmap.pool_name[p->first] << " ";
7636 found = true;
7637 }
7638 }
7639 if (found) {
7640 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7641 }
7642 return found;
7643 }
7644
7645 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7646 map<string,string> *erasure_code_profile_map,
7647 ostream *ss)
7648 {
7649 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7650 get_json_str_map,
7651 *ss,
7652 erasure_code_profile_map,
7653 true);
7654 if (r)
7655 return r;
7656 ceph_assert((*erasure_code_profile_map).count("plugin"));
7657 string default_plugin = (*erasure_code_profile_map)["plugin"];
7658 map<string,string> user_map;
7659 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7660 i != erasure_code_profile.end();
7661 ++i) {
7662 size_t equal = i->find('=');
7663 if (equal == string::npos) {
7664 user_map[*i] = string();
7665 (*erasure_code_profile_map)[*i] = string();
7666 } else {
7667 const string key = i->substr(0, equal);
7668 equal++;
7669 const string value = i->substr(equal);
7670 if (key.find("ruleset-") == 0) {
7671 *ss << "property '" << key << "' is no longer supported; try "
7672 << "'crush-" << key.substr(8) << "' instead";
7673 return -EINVAL;
7674 }
7675 user_map[key] = value;
7676 (*erasure_code_profile_map)[key] = value;
7677 }
7678 }
7679
7680 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7681 (*erasure_code_profile_map) = user_map;
7682
7683 return 0;
7684 }
7685
7686 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7687 const string &erasure_code_profile,
7688 uint8_t repl_size,
7689 unsigned *size, unsigned *min_size,
7690 ostream *ss)
7691 {
7692 int err = 0;
7693 bool set_min_size = false;
7694 switch (pool_type) {
7695 case pg_pool_t::TYPE_REPLICATED:
7696 if (osdmap.stretch_mode_enabled) {
7697 if (repl_size == 0)
7698 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7699 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7700 *ss << "prepare_pool_size: we are in stretch mode but size "
7701 << repl_size << " does not match!";
7702 return -EINVAL;
7703 }
7704 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7705 set_min_size = true;
7706 }
7707 if (repl_size == 0) {
7708 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7709 }
7710 *size = repl_size;
7711 if (!set_min_size)
7712 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7713 break;
7714 case pg_pool_t::TYPE_ERASURE:
7715 {
7716 if (osdmap.stretch_mode_enabled) {
7717 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7718 return -EINVAL;
7719 }
7720 ErasureCodeInterfaceRef erasure_code;
7721 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7722 if (err == 0) {
7723 *size = erasure_code->get_chunk_count();
7724 *min_size =
7725 erasure_code->get_data_chunk_count() +
7726 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7727 assert(*min_size <= *size);
7728 assert(*min_size >= erasure_code->get_data_chunk_count());
7729 }
7730 }
7731 break;
7732 default:
7733 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7734 err = -EINVAL;
7735 break;
7736 }
7737 return err;
7738 }
7739
// Compute the stripe width for a new pool.  Replicated pools have no
// stripe width (left untouched).  Erasure pools: width = data_chunks *
// chunk_size, where the stripe unit comes from the profile (if set) or
// the osd_pool_erasure_code_stripe_unit config option.  Returns 0 on
// success or a negative errno (details in *ss).
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      // config default; overridden by the profile's own stripe_unit if set
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	// profile values were validated in normalize_profile()
	ceph_assert(err_str.empty());
      }
      // get_chunk_size() applies the plugin's alignment/padding
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7778
7779 int OSDMonitor::get_replicated_stretch_crush_rule()
7780 {
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7785 * the most users!
7786 */
7787 map<int,int> rule_counts;
7788 for (const auto& pooli : osdmap.pools) {
7789 const pg_pool_t& p = pooli.second;
7790 if (p.is_replicated() && p.is_stretch_pool()) {
7791 if (!rule_counts.count(p.crush_rule)) {
7792 rule_counts[p.crush_rule] = 1;
7793 } else {
7794 ++rule_counts[p.crush_rule];
7795 }
7796 }
7797 }
7798
7799 if (rule_counts.empty()) {
7800 return -ENOENT;
7801 }
7802
7803 int most_used_count = 0;
7804 int most_used_rule = -1;
7805 for (auto i : rule_counts) {
7806 if (i.second > most_used_count) {
7807 most_used_rule = i.first;
7808 most_used_count = i.second;
7809 }
7810 }
7811 ceph_assert(most_used_count > 0);
7812 ceph_assert(most_used_rule >= 0);
7813 return most_used_rule;
7814 }
7815
// Resolve (or create) the crush rule for a new pool.
//
// If *crush_rule >= 0 on entry it is taken as an explicit rule id and
// only validated for existence.  Otherwise:
//  - replicated pools: use rule_name if given, else the stretch-mode
//    guess or the configured default replicated rule;
//  - erasure pools: create/find a rule for the EC profile; a freshly
//    created or still-pending rule returns -EAGAIN so the caller retries
//    after the rule commits, while an already-committed rule (-EEXIST)
//    is mapped to success.
// Returns 0 on success or a negative errno (details in *ss).
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable right now
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // explicit rule id supplied by the caller: just validate it
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7880
7881 int OSDMonitor::get_crush_rule(const string &rule_name,
7882 int *crush_rule,
7883 ostream *ss)
7884 {
7885 int ret;
7886 ret = osdmap.crush->get_rule_id(rule_name);
7887 if (ret != -ENOENT) {
7888 // found it, use it
7889 *crush_rule = ret;
7890 } else {
7891 CrushWrapper newcrush = _get_pending_crush();
7892
7893 ret = newcrush.get_rule_id(rule_name);
7894 if (ret != -ENOENT) {
7895 // found it, wait for it to be proposed
7896 dout(20) << __func__ << ": rule " << rule_name
7897 << " try again" << dendl;
7898 return -EAGAIN;
7899 } else {
7900 // Cannot find it , return error
7901 *ss << "specified rule " << rule_name << " doesn't exist";
7902 return ret;
7903 }
7904 }
7905 return 0;
7906 }
7907
7908 /*
7909 * Get the number of 'in' osds according to the crush_rule,
7910 */
7911 uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule)
7912 {
7913 set<int> out_osds;
7914 set<int> crush_in_osds;
7915 set<int> roots;
7916 CrushWrapper newcrush = _get_pending_crush();
7917 newcrush.find_takes_by_rule(crush_rule, &roots);
7918 for (auto root : roots) {
7919 const char *rootname = newcrush.get_item_name(root);
7920 set<int> crush_all_osds;
7921 newcrush.get_leaves(rootname, &crush_all_osds);
7922 std::set_difference(crush_all_osds.begin(), crush_all_osds.end(),
7923 out_osds.begin(), out_osds.end(),
7924 std::inserter(crush_in_osds, crush_in_osds.end()));
7925 }
7926 return crush_in_osds.size();
7927 }
7928
7929 int OSDMonitor::check_pg_num(int64_t pool,
7930 int pg_num,
7931 int size,
7932 int crush_rule,
7933 ostream *ss)
7934 {
7935 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7936 uint64_t projected = 0;
7937 uint32_t osd_num_by_crush = 0;
7938 set<int64_t> crush_pool_ids;
7939 if (pool < 0) {
7940 // a new pool
7941 projected += pg_num * size;
7942 }
7943
7944 osd_num_by_crush = get_osd_num_by_crush(crush_rule);
7945 osdmap.get_pool_ids_by_rule(crush_rule, &crush_pool_ids);
7946
7947 for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
7948 // Check only for pools affected by crush rule
7949 if (crush_pool_ids.contains(pool_id)) {
7950 if (pool_id == pool) {
7951 // Specified pool, use given pg_num and size values.
7952 projected += pg_num * size;
7953 } else {
7954 // Use pg_num_target for evaluating the projected pg num
7955 projected += pool_info.get_pg_num_target() * pool_info.get_size();
7956 }
7957 }
7958 }
7959 // assume min cluster size 3
7960 osd_num_by_crush = std::max(osd_num_by_crush, 3u);
7961 auto projected_pgs_per_osd = projected / osd_num_by_crush;
7962
7963 if (projected_pgs_per_osd > max_pgs_per_osd) {
7964 if (pool >= 0) {
7965 *ss << "pool id " << pool;
7966 }
7967 *ss << " pg_num " << pg_num
7968 << " size " << size
7969 << " for this pool would result in "
7970 << projected_pgs_per_osd
7971 << " cumulative PGs per OSD (" << projected
7972 << " total PG replicas on " << osd_num_by_crush
7973 << " 'in' root OSDs by crush rule) "
7974 << "which exceeds the mon_max_pg_per_osd "
7975 << "value of " << max_pgs_per_osd;
7976 return -ERANGE;
7977 }
7978 return 0;
7979 }
7980
7981 /**
7982 * @param name The name of the new pool
7983 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
7985 * @param pg_num The pg_num to use. If set to 0, will use the system default
7986 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7987 * @param pg_num_min min pg_num
7988 * @param pg_num_max max pg_num
7989 * @param repl_size Replication factor, or 0 for default
7990 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7991 * @param pool_type TYPE_ERASURE, or TYPE_REP
7992 * @param expected_num_objects expected number of objects on the pool
7993 * @param fast_read fast read type.
7994 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
7995 * @param bool bulk indicates whether pool should be a bulk pool
7996 * @param bool crimson indicates whether pool is a crimson pool
7997 * @param ss human readable error message, if any.
7998 *
7999 * @return 0 on success, negative errno on failure.
8000 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 string pg_autoscale_mode,
				 bool bulk,
				 bool crimson,
				 ostream *ss)
{
  if (crimson && pg_autoscale_mode.empty()) {
    // default pg_autoscale_mode to off for crimson, we'll error out below if
    // the user tried to actually set pg_autoscale_mode to something other than
    // "off"
    pg_autoscale_mode = "off";
  }

  // A pool must have a non-empty name.
  if (name.length() == 0)
    return -EINVAL;

  // Fill in pg_num from config when not given: when autoscaling is "on" the
  // initial pg_num is 1 (the autoscaler grows it later), otherwise use the
  // configured default.
  if (pg_num == 0) {
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
        return mode == "on" ? 1 : pg_num;
      };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  // pgp_num defaults to the configured value, falling back to pg_num.
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  // Sanity-check pg_num/pgp_num against configured bounds.
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
        << ", which in this case is " << pg_num;
    return -ERANGE;
  }

  if (crimson) {
    /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
     * be static. User must also have specified set-allow-crimson */
    const auto *suffix = " (--crimson specified or osd_pool_default_crimson set)";
    if (pool_type != pg_pool_t::TYPE_REPLICATED) {
      *ss << "crimson-osd only supports replicated pools" << suffix;
      return -EINVAL;
    } else if (pg_autoscale_mode != "off") {
      *ss << "crimson-osd does not support changing pg_num or pgp_num, "
	  << "pg_autoscale_mode must be set to 'off'" << suffix;
      return -EINVAL;
    } else if (!osdmap.get_allow_crimson()) {
      *ss << "set-allow-crimson must be set to create a pool with the "
	  << "crimson flag" << suffix;
      return -EINVAL;
    }
  }

  // fast_read only makes sense for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // Resolve/create the crush rule for the new pool (may return -EAGAIN
  // while a newly created rule waits to be proposed).
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Derive size/min_size from repl_size or the erasure code profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Optional smoke test: run a small forked crush mapping test against the
  // pending crush map to catch rules that cannot place data.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(cct, g_conf()->mon_lease);
    dout(10) << __func__ << " crush test_with_fork tester created " << dendl;
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // Reject if the new pool would exceed mon_max_pg_per_osd (pool id -1
  // marks "new pool" so its PGs are counted on top of existing pools).
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  // The rule's type (replicated vs erasure) must match the pool type.
  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  // Stripe width comes from the erasure code profile (0 for replicated).
  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the effective fast_read flag for erasure-coded pools.
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // Idempotency: if a pool with this name is already pending creation in
  // this proposal, report success without creating a duplicate.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id and a fresh pg_pool_t in the pending map.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // Seed flags from config defaults plus the caller's choices.
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // CREATING is cleared once the initial PGs have been instantiated.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;
  if (crimson) {
    pi->set_flag(pg_pool_t::FLAG_CRIMSON);
    // crimson does not support pg_num changes (see checks above).
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  }

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // Inherit stretch-mode peering constraints from the osdmap, halving
  // size/min_size when the cluster is degraded to a single zone.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Autoscale mode: start from the configured default, OFF if unknown;
  // an explicit per-pool mode (if valid) overrides this further below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Cap the initial actual pg_num; the full requested pg_num is recorded
  // as the target and the mgr grows the pool toward it.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  // PG_NUM_MIN/MAX options require sufficiently new OSDs to interpret them.
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier tunables, seeded from config defaults (ratios are stored in
  // micro units, hence the * 1000000).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  // Finally register the pool name in the pending incremental.
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8271
8272 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8273 {
8274 op->mark_osdmon_event(__func__);
8275 ostringstream ss;
8276 if (pending_inc.new_flags < 0)
8277 pending_inc.new_flags = osdmap.get_flags();
8278 pending_inc.new_flags |= flag;
8279 ss << OSDMap::get_flag_string(flag) << " is set";
8280 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8281 get_last_committed() + 1));
8282 return true;
8283 }
8284
8285 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8286 {
8287 op->mark_osdmon_event(__func__);
8288 ostringstream ss;
8289 if (pending_inc.new_flags < 0)
8290 pending_inc.new_flags = osdmap.get_flags();
8291 pending_inc.new_flags &= ~flag;
8292 ss << OSDMap::get_flag_string(flag) << " is unset";
8293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8294 get_last_committed() + 1));
8295 return true;
8296 }
8297
8298 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8299 stringstream& ss)
8300 {
8301 string poolstr;
8302 cmd_getval(cmdmap, "pool", poolstr);
8303 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8304 if (pool < 0) {
8305 ss << "unrecognized pool '" << poolstr << "'";
8306 return -ENOENT;
8307 }
8308 string var;
8309 cmd_getval(cmdmap, "var", var);
8310
8311 pg_pool_t p = *osdmap.get_pg_pool(pool);
8312 if (pending_inc.new_pools.count(pool))
8313 p = pending_inc.new_pools[pool];
8314
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8320 string val;
8321 string interr, floaterr;
8322 int64_t n = 0;
8323 double f = 0;
8324 int64_t uf = 0; // micro-f
8325 cmd_getval(cmdmap, "val", val);
8326
8327 auto si_options = {
8328 "target_max_objects"
8329 };
8330 auto iec_options = {
8331 "target_max_bytes",
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8335 "csum_max_block",
8336 "csum_min_block",
8337 };
8338 if (count(begin(si_options), end(si_options), var)) {
8339 n = strict_si_cast<int64_t>(val, &interr);
8340 } else if (count(begin(iec_options), end(iec_options), var)) {
8341 n = strict_iec_cast<int64_t>(val, &interr);
8342 } else {
8343 // parse string as both int and float; different fields use different types.
8344 n = strict_strtoll(val.c_str(), 10, &interr);
8345 f = strict_strtod(val.c_str(), &floaterr);
8346 uf = llrintl(f * (double)1000000.0);
8347 }
8348
8349 if (!p.is_tier() &&
8350 (var == "hit_set_type" || var == "hit_set_period" ||
8351 var == "hit_set_count" || var == "hit_set_fpp" ||
8352 var == "target_max_objects" || var == "target_max_bytes" ||
8353 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8354 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8355 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8356 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8357 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8358 return -EACCES;
8359 }
8360
8361 if (var == "size") {
8362 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8363 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8364 return -EPERM;
8365 }
8366 if (p.type == pg_pool_t::TYPE_ERASURE) {
8367 ss << "can not change the size of an erasure-coded pool";
8368 return -ENOTSUP;
8369 }
8370 if (interr.length()) {
8371 ss << "error parsing integer value '" << val << "': " << interr;
8372 return -EINVAL;
8373 }
8374 if (n <= 0 || n > 10) {
8375 ss << "pool size must be between 1 and 10";
8376 return -EINVAL;
8377 }
8378 if (n == 1) {
8379 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8380 ss << "configuring pool size as 1 is disabled by default.";
8381 return -EPERM;
8382 }
8383 bool sure = false;
8384 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8385 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8388 return -EPERM;
8389 }
8390 }
8391 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8392 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8393 return -EINVAL;
8394 }
8395 if (n > p.size) {
8396 // only when increasing pool size
8397 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8398 if (r < 0) {
8399 return r;
8400 }
8401 }
8402 p.size = n;
8403 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8404 } else if (var == "min_size") {
8405 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8406 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8407 return -EPERM;
8408 }
8409 if (interr.length()) {
8410 ss << "error parsing integer value '" << val << "': " << interr;
8411 return -EINVAL;
8412 }
8413
8414 if (p.type != pg_pool_t::TYPE_ERASURE) {
8415 if (n < 1 || n > p.size) {
8416 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8417 return -EINVAL;
8418 }
8419 } else {
8420 ErasureCodeInterfaceRef erasure_code;
8421 int k;
8422 stringstream tmp;
8423 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8424 if (err == 0) {
8425 k = erasure_code->get_data_chunk_count();
8426 } else {
8427 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8428 return err;
8429 }
8430
8431 if (n < k || n > p.size) {
8432 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8433 return -EINVAL;
8434 }
8435 }
8436 p.min_size = n;
8437 } else if (var == "pg_num_actual") {
8438 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8439 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8440 return -EPERM;
8441 }
8442 if (interr.length()) {
8443 ss << "error parsing integer value '" << val << "': " << interr;
8444 return -EINVAL;
8445 }
8446 if (n == (int)p.get_pg_num()) {
8447 return 0;
8448 }
8449 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8450 ss << "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8453 return -ERANGE;
8454 }
8455 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8456 ss << "cannot adjust pg_num while initial PGs are being created";
8457 return -EBUSY;
8458 }
8459 if (n > (int)p.get_pg_num()) {
8460 if (p.get_pg_num() != p.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8464 }
8465 p.set_pg_num(n);
8466 } else {
8467 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8468 ss << "nautilus OSDs are required to adjust pg_num_pending";
8469 return -EPERM;
8470 }
8471 if (n < (int)p.get_pgp_num()) {
8472 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8473 return -EINVAL;
8474 }
8475 if (n < (int)p.get_pg_num() - 1) {
8476 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8478 return -EINVAL;
8479 }
8480 p.set_pg_num_pending(n);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8484 }
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p.last_force_op_resend_preluminous = pending_inc.epoch;
8488 } else if (var == "pg_num") {
8489 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8490 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8491 return -EPERM;
8492 }
8493 if (interr.length()) {
8494 ss << "error parsing integer value '" << val << "': " << interr;
8495 return -EINVAL;
8496 }
8497 if (n == (int)p.get_pg_num_target()) {
8498 return 0;
8499 }
8500 if (n <= 0 || static_cast<uint64_t>(n) >
8501 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8502 ss << "'pg_num' must be greater than 0 and less than or equal to "
8503 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8505 return -ERANGE;
8506 }
8507 if (n > (int)p.get_pg_num_target()) {
8508 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8509 if (r) {
8510 return r;
8511 }
8512 bool force = false;
8513 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8514 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8515 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8516 return -EPERM;
8517 }
8518 } else {
8519 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8520 ss << "nautilus OSDs are required to decrease pg_num";
8521 return -EPERM;
8522 }
8523 }
8524 int64_t pg_min = 0, pg_max = 0;
8525 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8526 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8527 if (pg_min && n < pg_min) {
8528 ss << "specified pg_num " << n
8529 << " < pg_num_min " << pg_min;
8530 return -EINVAL;
8531 }
8532 if (pg_max && n > pg_max) {
8533 ss << "specified pg_num " << n
8534 << " < pg_num_max " << pg_max;
8535 return -EINVAL;
8536 }
8537 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n > (int)p.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p.last_force_op_resend_preluminous = pending_inc.epoch;
8546 p.set_pg_num(n);
8547 } else {
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8551 // manually.
8552 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8553 p.set_pgp_num_target(n);
8554 }
8555 p.set_pg_num_target(n);
8556 }
8557 } else if (var == "pgp_num_actual") {
8558 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8559 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8560 return -EPERM;
8561 }
8562 if (interr.length()) {
8563 ss << "error parsing integer value '" << val << "': " << interr;
8564 return -EINVAL;
8565 }
8566 if (n <= 0) {
8567 ss << "specified pgp_num must > 0, but you set to " << n;
8568 return -EINVAL;
8569 }
8570 if (n > (int)p.get_pg_num()) {
8571 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8572 return -EINVAL;
8573 }
8574 if (n > (int)p.get_pg_num_pending()) {
8575 ss << "specified pgp_num " << n
8576 << " > pg_num_pending " << p.get_pg_num_pending();
8577 return -EINVAL;
8578 }
8579 p.set_pgp_num(n);
8580 } else if (var == "pgp_num") {
8581 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8582 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8583 return -EPERM;
8584 }
8585 if (interr.length()) {
8586 ss << "error parsing integer value '" << val << "': " << interr;
8587 return -EINVAL;
8588 }
8589 if (n <= 0) {
8590 ss << "specified pgp_num must > 0, but you set to " << n;
8591 return -EINVAL;
8592 }
8593 if (n > (int)p.get_pg_num_target()) {
8594 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8595 return -EINVAL;
8596 }
8597 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8598 // pre-nautilus osdmap format; increase pgp_num directly
8599 p.set_pgp_num(n);
8600 } else {
8601 p.set_pgp_num_target(n);
8602 }
8603 } else if (var == "pg_autoscale_mode") {
8604 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8605 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8606 ss << "specified invalid mode " << val;
8607 return -EINVAL;
8608 }
8609 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8610 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8611 return -EINVAL;
8612 }
8613 p.pg_autoscale_mode = m;
8614 } else if (var == "crush_rule") {
8615 int id = osdmap.crush->get_rule_id(val);
8616 if (id == -ENOENT) {
8617 ss << "crush rule " << val << " does not exist";
8618 return -ENOENT;
8619 }
8620 if (id < 0) {
8621 ss << cpp_strerror(id);
8622 return -ENOENT;
8623 }
8624 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8625 ss << "crush rule " << id << " type does not match pool";
8626 return -EINVAL;
8627 }
8628 p.crush_rule = id;
8629 } else if (var == "nodelete" || var == "nopgchange" ||
8630 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8631 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8632 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val == "true" || (interr.empty() && n == 1)) {
8635 p.set_flag(flag);
8636 } else if (val == "false" || (interr.empty() && n == 0)) {
8637 if (flag == pg_pool_t::FLAG_NOPGCHANGE && p.is_crimson()) {
8638 ss << "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8639 return -EINVAL;
8640 }
8641 p.unset_flag(flag);
8642 } else {
8643 ss << "expecting value 'true', 'false', '0', or '1'";
8644 return -EINVAL;
8645 }
8646 } else if (var == "eio") {
8647 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8648
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val == "true" || (interr.empty() && n == 1)) {
8651 p.set_flag(flag);
8652 } else if (val == "false" || (interr.empty() && n == 0)) {
8653 p.unset_flag(flag);
8654 } else {
8655 ss << "expecting value 'true', 'false', '0', or '1'";
8656 return -EINVAL;
8657 }
8658 } else if (var == "hashpspool") {
8659 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8660 bool force = false;
8661 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8662
8663 if (!force) {
8664 ss << "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8667 return -EPERM;
8668 }
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val == "true" || (interr.empty() && n == 1)) {
8671 p.set_flag(flag);
8672 } else if (val == "false" || (interr.empty() && n == 0)) {
8673 p.unset_flag(flag);
8674 } else {
8675 ss << "expecting value 'true', 'false', '0', or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "hit_set_type") {
8679 if (val == "none")
8680 p.hit_set_params = HitSet::Params();
8681 else {
8682 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8683 if (err)
8684 return err;
8685 if (val == "bloom") {
8686 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8687 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8688 p.hit_set_params = HitSet::Params(bsp);
8689 } else if (val == "explicit_hash")
8690 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8691 else if (val == "explicit_object")
8692 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8693 else {
8694 ss << "unrecognized hit_set type '" << val << "'";
8695 return -EINVAL;
8696 }
8697 }
8698 } else if (var == "hit_set_period") {
8699 if (interr.length()) {
8700 ss << "error parsing integer value '" << val << "': " << interr;
8701 return -EINVAL;
8702 } else if (n < 0) {
8703 ss << "hit_set_period should be non-negative";
8704 return -EINVAL;
8705 }
8706 p.hit_set_period = n;
8707 } else if (var == "hit_set_count") {
8708 if (interr.length()) {
8709 ss << "error parsing integer value '" << val << "': " << interr;
8710 return -EINVAL;
8711 } else if (n < 0) {
8712 ss << "hit_set_count should be non-negative";
8713 return -EINVAL;
8714 }
8715 p.hit_set_count = n;
8716 } else if (var == "hit_set_fpp") {
8717 if (floaterr.length()) {
8718 ss << "error parsing floating point value '" << val << "': " << floaterr;
8719 return -EINVAL;
8720 } else if (f < 0 || f > 1.0) {
8721 ss << "hit_set_fpp should be in the range 0..1";
8722 return -EINVAL;
8723 }
8724 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8725 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8726 return -EINVAL;
8727 }
8728 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8729 bloomp->set_fpp(f);
8730 } else if (var == "use_gmt_hitset") {
8731 if (val == "true" || (interr.empty() && n == 1)) {
8732 p.use_gmt_hitset = true;
8733 } else {
8734 ss << "expecting value 'true' or '1'";
8735 return -EINVAL;
8736 }
8737 } else if (var == "allow_ec_overwrites") {
8738 if (!p.is_erasure()) {
8739 ss << "ec overwrites can only be enabled for an erasure coded pool";
8740 return -EINVAL;
8741 }
8742 stringstream err;
8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8744 !is_pool_currently_all_bluestore(pool, p, &err)) {
8745 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8746 return -EINVAL;
8747 }
8748 if (val == "true" || (interr.empty() && n == 1)) {
8749 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8750 } else if (val == "false" || (interr.empty() && n == 0)) {
8751 ss << "ec overwrites cannot be disabled once enabled";
8752 return -EINVAL;
8753 } else {
8754 ss << "expecting value 'true', 'false', '0', or '1'";
8755 return -EINVAL;
8756 }
8757 } else if (var == "target_max_objects") {
8758 if (interr.length()) {
8759 ss << "error parsing int '" << val << "': " << interr;
8760 return -EINVAL;
8761 }
8762 p.target_max_objects = n;
8763 } else if (var == "target_max_bytes") {
8764 if (interr.length()) {
8765 ss << "error parsing int '" << val << "': " << interr;
8766 return -EINVAL;
8767 }
8768 p.target_max_bytes = n;
8769 } else if (var == "cache_target_dirty_ratio") {
8770 if (floaterr.length()) {
8771 ss << "error parsing float '" << val << "': " << floaterr;
8772 return -EINVAL;
8773 }
8774 if (f < 0 || f > 1.0) {
8775 ss << "value must be in the range 0..1";
8776 return -ERANGE;
8777 }
8778 p.cache_target_dirty_ratio_micro = uf;
8779 } else if (var == "cache_target_dirty_high_ratio") {
8780 if (floaterr.length()) {
8781 ss << "error parsing float '" << val << "': " << floaterr;
8782 return -EINVAL;
8783 }
8784 if (f < 0 || f > 1.0) {
8785 ss << "value must be in the range 0..1";
8786 return -ERANGE;
8787 }
8788 p.cache_target_dirty_high_ratio_micro = uf;
8789 } else if (var == "cache_target_full_ratio") {
8790 if (floaterr.length()) {
8791 ss << "error parsing float '" << val << "': " << floaterr;
8792 return -EINVAL;
8793 }
8794 if (f < 0 || f > 1.0) {
8795 ss << "value must be in the range 0..1";
8796 return -ERANGE;
8797 }
8798 p.cache_target_full_ratio_micro = uf;
8799 } else if (var == "cache_min_flush_age") {
8800 if (interr.length()) {
8801 ss << "error parsing int '" << val << "': " << interr;
8802 return -EINVAL;
8803 }
8804 p.cache_min_flush_age = n;
8805 } else if (var == "cache_min_evict_age") {
8806 if (interr.length()) {
8807 ss << "error parsing int '" << val << "': " << interr;
8808 return -EINVAL;
8809 }
8810 p.cache_min_evict_age = n;
8811 } else if (var == "min_read_recency_for_promote") {
8812 if (interr.length()) {
8813 ss << "error parsing integer value '" << val << "': " << interr;
8814 return -EINVAL;
8815 }
8816 p.min_read_recency_for_promote = n;
8817 } else if (var == "hit_set_grade_decay_rate") {
8818 if (interr.length()) {
8819 ss << "error parsing integer value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > 100 || n < 0) {
8823 ss << "value out of range,valid range is 0 - 100";
8824 return -EINVAL;
8825 }
8826 p.hit_set_grade_decay_rate = n;
8827 } else if (var == "hit_set_search_last_n") {
8828 if (interr.length()) {
8829 ss << "error parsing integer value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n > p.hit_set_count || n < 0) {
8833 ss << "value out of range,valid range is 0 - hit_set_count";
8834 return -EINVAL;
8835 }
8836 p.hit_set_search_last_n = n;
8837 } else if (var == "min_write_recency_for_promote") {
8838 if (interr.length()) {
8839 ss << "error parsing integer value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 p.min_write_recency_for_promote = n;
8843 } else if (var == "fast_read") {
8844 if (p.is_replicated()) {
8845 ss << "fast read is not supported in replication pool";
8846 return -EINVAL;
8847 }
8848 if (val == "true" || (interr.empty() && n == 1)) {
8849 p.fast_read = true;
8850 } else if (val == "false" || (interr.empty() && n == 0)) {
8851 p.fast_read = false;
8852 } else {
8853 ss << "expecting value 'true', 'false', '0', or '1'";
8854 return -EINVAL;
8855 }
8856 } else if (pool_opts_t::is_opt_name(var)) {
8857 bool unset = val == "unset";
8858 if (var == "compression_mode") {
8859 if (!unset) {
8860 auto cmode = Compressor::get_comp_mode_type(val);
8861 if (!cmode) {
8862 ss << "unrecognized compression mode '" << val << "'";
8863 return -EINVAL;
8864 }
8865 }
8866 } else if (var == "compression_algorithm") {
8867 if (!unset) {
8868 auto alg = Compressor::get_comp_alg_type(val);
8869 if (!alg) {
8870 ss << "unrecognized compression_algorithm '" << val << "'";
8871 return -EINVAL;
8872 }
8873 }
8874 } else if (var == "compression_required_ratio") {
8875 if (floaterr.length()) {
8876 ss << "error parsing float value '" << val << "': " << floaterr;
8877 return -EINVAL;
8878 }
8879 if (f < 0 || f > 1) {
8880 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8881 return -EINVAL;
8882 }
8883 } else if (var == "csum_type") {
8884 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8885 if (t < 0 ) {
8886 ss << "unrecognized csum_type '" << val << "'";
8887 return -EINVAL;
8888 }
8889 //preserve csum_type numeric value
8890 n = t;
8891 interr.clear();
8892 } else if (var == "compression_max_blob_size" ||
8893 var == "compression_min_blob_size" ||
8894 var == "csum_max_block" ||
8895 var == "csum_min_block") {
8896 if (interr.length()) {
8897 ss << "error parsing int value '" << val << "': " << interr;
8898 return -EINVAL;
8899 }
8900 } else if (var == "fingerprint_algorithm") {
8901 if (!unset) {
8902 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8903 if (!alg) {
8904 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8905 return -EINVAL;
8906 }
8907 }
8908 } else if (var == "target_size_bytes") {
8909 if (interr.length()) {
8910 ss << "error parsing unit value '" << val << "': " << interr;
8911 return -EINVAL;
8912 }
8913 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8914 ss << "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8916 return -EINVAL;
8917 }
8918 } else if (var == "target_size_ratio") {
8919 if (f < 0.0) {
8920 ss << "target_size_ratio cannot be negative";
8921 return -EINVAL;
8922 }
8923 } else if (var == "pg_num_min") {
8924 if (interr.length()) {
8925 ss << "error parsing int value '" << val << "': " << interr;
8926 return -EINVAL;
8927 }
8928 if (n > (int)p.get_pg_num_target()) {
8929 ss << "specified pg_num_min " << n
8930 << " > pg_num " << p.get_pg_num_target();
8931 return -EINVAL;
8932 }
8933 } else if (var == "pg_num_max") {
8934 if (interr.length()) {
8935 ss << "error parsing int value '" << val << "': " << interr;
8936 return -EINVAL;
8937 }
8938 if (n && n < (int)p.get_pg_num_target()) {
8939 ss << "specified pg_num_max " << n
8940 << " < pg_num " << p.get_pg_num_target();
8941 return -EINVAL;
8942 }
8943 } else if (var == "recovery_priority") {
8944 if (interr.length()) {
8945 ss << "error parsing int value '" << val << "': " << interr;
8946 return -EINVAL;
8947 }
8948 if (!g_conf()->debug_allow_any_pool_priority) {
8949 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8950 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX;
8952 return -EINVAL;
8953 }
8954 }
8955 } else if (var == "pg_autoscale_bias") {
8956 if (f < 0.0 || f > 1000.0) {
8957 ss << "pg_autoscale_bias must be between 0 and 1000";
8958 return -EINVAL;
8959 }
8960 } else if (var == "dedup_tier") {
8961 if (interr.empty()) {
8962 ss << "expecting value 'pool name'";
8963 return -EINVAL;
8964 }
8965 // Current base tier in dedup does not support ec pool
8966 if (p.is_erasure()) {
8967 ss << "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8969 return -ENOTSUP;
8970 }
8971 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8972 if (lowtierpool_id < 0) {
8973 ss << "unrecognized pool '" << val << "'";
8974 return -ENOENT;
8975 }
8976 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8977 ceph_assert(tp);
8978 n = lowtierpool_id;
8979 // The original input is string (pool name), but we convert it to int64_t.
8980 // So, clear interr
8981 interr.clear();
8982 } else if (var == "dedup_chunk_algorithm") {
8983 if (!unset) {
8984 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8985 if (!alg) {
8986 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8987 return -EINVAL;
8988 }
8989 }
8990 } else if (var == "dedup_cdc_chunk_size") {
8991 if (interr.length()) {
8992 ss << "error parsing int value '" << val << "': " << interr;
8993 return -EINVAL;
8994 }
8995 }
8996
8997 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8998 switch (desc.type) {
8999 case pool_opts_t::STR:
9000 if (unset) {
9001 p.opts.unset(desc.key);
9002 } else {
9003 p.opts.set(desc.key, static_cast<std::string>(val));
9004 }
9005 break;
9006 case pool_opts_t::INT:
9007 if (interr.length()) {
9008 ss << "error parsing integer value '" << val << "': " << interr;
9009 return -EINVAL;
9010 }
9011 if (n == 0) {
9012 p.opts.unset(desc.key);
9013 } else {
9014 p.opts.set(desc.key, static_cast<int64_t>(n));
9015 }
9016 break;
9017 case pool_opts_t::DOUBLE:
9018 if (floaterr.length()) {
9019 ss << "error parsing floating point value '" << val << "': " << floaterr;
9020 return -EINVAL;
9021 }
9022 if (f == 0) {
9023 p.opts.unset(desc.key);
9024 } else {
9025 p.opts.set(desc.key, static_cast<double>(f));
9026 }
9027 break;
9028 default:
9029 ceph_assert(!"unknown type");
9030 }
9031 } else {
9032 ss << "unrecognized variable '" << var << "'";
9033 return -EINVAL;
9034 }
9035 if (val != "unset") {
9036 ss << "set pool " << pool << " " << var << " to " << val;
9037 } else {
9038 ss << "unset pool " << pool << " " << var;
9039 }
9040 p.last_change = pending_inc.epoch;
9041 pending_inc.new_pools[pool] = p;
9042 return 0;
9043 }
9044
9045 int OSDMonitor::prepare_command_pool_application(const string &prefix,
9046 const cmdmap_t& cmdmap,
9047 stringstream& ss)
9048 {
9049 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
9050 }
9051
9052 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
9053 const cmdmap_t& cmdmap,
9054 stringstream& ss,
9055 bool *modified)
9056 {
9057 return _command_pool_application(prefix, cmdmap, ss, modified, false);
9058 }
9059
9060
9061 /**
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9066 */
9067 int OSDMonitor::_command_pool_application(const string &prefix,
9068 const cmdmap_t& cmdmap,
9069 stringstream& ss,
9070 bool *modified,
9071 bool preparing)
9072 {
9073 string pool_name;
9074 cmd_getval(cmdmap, "pool", pool_name);
9075 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9076 if (pool < 0) {
9077 ss << "unrecognized pool '" << pool_name << "'";
9078 return -ENOENT;
9079 }
9080
9081 pg_pool_t p = *osdmap.get_pg_pool(pool);
9082 if (preparing) {
9083 if (pending_inc.new_pools.count(pool)) {
9084 p = pending_inc.new_pools[pool];
9085 }
9086 }
9087
9088 string app;
9089 cmd_getval(cmdmap, "app", app);
9090 bool app_exists = (p.application_metadata.count(app) > 0);
9091
9092 string key;
9093 cmd_getval(cmdmap, "key", key);
9094 if (key == "all") {
9095 ss << "key cannot be 'all'";
9096 return -EINVAL;
9097 }
9098
9099 string value;
9100 cmd_getval(cmdmap, "value", value);
9101 if (value == "all") {
9102 ss << "value cannot be 'all'";
9103 return -EINVAL;
9104 }
9105
9106 if (boost::algorithm::ends_with(prefix, "enable")) {
9107 if (app.empty()) {
9108 ss << "application name must be provided";
9109 return -EINVAL;
9110 }
9111
9112 if (p.is_tier()) {
9113 ss << "application must be enabled on base tier";
9114 return -EINVAL;
9115 }
9116
9117 bool force = false;
9118 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9119
9120 if (!app_exists && !p.application_metadata.empty() && !force) {
9121 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9123 return -EPERM;
9124 }
9125
9126 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9127 ss << "too many enabled applications on pool '" << pool_name << "'; "
9128 << "max " << MAX_POOL_APPLICATIONS;
9129 return -EINVAL;
9130 }
9131
9132 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "application name '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 if (!app_exists) {
9139 p.application_metadata[app] = {};
9140 }
9141 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9142
9143 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9144 bool force = false;
9145 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9146
9147 if (!force) {
9148 ss << "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9151 return -EPERM;
9152 }
9153
9154 if (!app_exists) {
9155 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9156 << "'";
9157 return 0; // idempotent
9158 }
9159
9160 p.application_metadata.erase(app);
9161 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9162
9163 } else if (boost::algorithm::ends_with(prefix, "set")) {
9164 if (p.is_tier()) {
9165 ss << "application metadata must be set on base tier";
9166 return -EINVAL;
9167 }
9168
9169 if (!app_exists) {
9170 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9171 << "'";
9172 return -ENOENT;
9173 }
9174
9175 string key;
9176 cmd_getval(cmdmap, "key", key);
9177
9178 if (key.empty()) {
9179 ss << "key must be provided";
9180 return -EINVAL;
9181 }
9182
9183 auto &app_keys = p.application_metadata[app];
9184 if (app_keys.count(key) == 0 &&
9185 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9186 ss << "too many keys set for application '" << app << "' on pool '"
9187 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9188 return -EINVAL;
9189 }
9190
9191 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9192 ss << "key '" << app << "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH;
9194 return -EINVAL;
9195 }
9196
9197 string value;
9198 cmd_getval(cmdmap, "value", value);
9199 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9200 ss << "value '" << value << "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH;
9202 return -EINVAL;
9203 }
9204
9205 p.application_metadata[app][key] = value;
9206 ss << "set application '" << app << "' key '" << key << "' to '"
9207 << value << "' on pool '" << pool_name << "'";
9208 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9209 if (!app_exists) {
9210 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9211 << "'";
9212 return -ENOENT;
9213 }
9214
9215 string key;
9216 cmd_getval(cmdmap, "key", key);
9217 auto it = p.application_metadata[app].find(key);
9218 if (it == p.application_metadata[app].end()) {
9219 ss << "application '" << app << "' on pool '" << pool_name
9220 << "' does not have key '" << key << "'";
9221 return 0; // idempotent
9222 }
9223
9224 p.application_metadata[app].erase(it);
9225 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9226 << pool_name << "'";
9227 } else {
9228 ceph_abort();
9229 }
9230
9231 if (preparing) {
9232 p.last_change = pending_inc.epoch;
9233 pending_inc.new_pools[pool] = p;
9234 }
9235
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified != nullptr) {
9239 *modified = true;
9240 }
9241
9242 return 0;
9243 }
9244
9245 int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper &newcrush,
9247 int32_t id,
9248 int32_t ancestor,
9249 bool has_ancestor,
9250 bool unlink_only)
9251 {
9252 int err = 0;
9253
9254 if (has_ancestor) {
9255 err = newcrush.remove_item_under(cct, id, ancestor,
9256 unlink_only);
9257 } else {
9258 err = newcrush.remove_item(cct, id, unlink_only);
9259 }
9260 return err;
9261 }
9262
// Stage the (already modified) CRUSH map into the pending incremental,
// replacing any CRUSH change previously staged in pending_inc.crush.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
9268
9269 int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper &newcrush,
9271 int32_t id,
9272 int32_t ancestor,
9273 bool has_ancestor,
9274 bool unlink_only)
9275 {
9276 int err = _prepare_command_osd_crush_remove(
9277 newcrush, id, ancestor,
9278 has_ancestor, unlink_only);
9279
9280 if (err < 0)
9281 return err;
9282
9283 ceph_assert(err == 0);
9284 do_osd_crush_remove(newcrush);
9285
9286 return 0;
9287 }
9288
9289 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9290 {
9291 if (osdmap.is_up(id)) {
9292 return -EBUSY;
9293 }
9294
9295 pending_inc.new_state[id] = osdmap.get_state(id);
9296 pending_inc.new_uuid[id] = uuid_d();
9297 pending_metadata_rm.insert(id);
9298 pending_metadata.erase(id);
9299
9300 return 0;
9301 }
9302
9303 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9304 {
9305 ceph_assert(existing_id);
9306 *existing_id = -1;
9307
9308 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9309 if (!osdmap.exists(i) &&
9310 pending_inc.new_up_client.count(i) == 0 &&
9311 (pending_inc.new_state.count(i) == 0 ||
9312 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9313 *existing_id = i;
9314 return -1;
9315 }
9316 }
9317
9318 if (pending_inc.new_max_osd < 0) {
9319 return osdmap.get_max_osd();
9320 }
9321 return pending_inc.new_max_osd;
9322 }
9323
// Create (or re-identify) an OSD in the pending incremental.
//
// If `uuid` matches an existing OSD, that id is reused; if `id` is given it
// is honored; otherwise an id is allocated (possibly raising max_osd). The
// chosen id is returned through *new_id. If `device_class` is non-empty the
// CRUSH device class is staged as well (failure there is non-fatal).
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already known: validation guarantees id agrees (or was unset)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a hole in the id space; no max_osd change needed
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage the device class for the new id in a copy of the pending CRUSH map
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure max_osd covers the chosen id (covers the goto shortcut paths
  // above, which skip the allocation logic)
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9413
// Validate an (id, uuid) pair for `osd create` / `osd new`.
//
// Return value convention (note the sign!):
//   0        - ok to create
//   EEXIST   - POSITIVE: osd already exists with this uuid (idempotent case);
//              its id is returned via *existing_id
//   -EEXIST  - uuid already bound to a DIFFERENT id
//   -EAGAIN  - osd is about to exist (pending incremental)
//   -EINVAL  - id already in use with a different uuid
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
           << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
         << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9484
9485 int OSDMonitor::prepare_command_osd_create(
9486 const int32_t id,
9487 const uuid_d& uuid,
9488 int32_t* existing_id,
9489 stringstream& ss)
9490 {
9491 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9492 ceph_assert(existing_id);
9493 if (osdmap.is_destroyed(id)) {
9494 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9495 "instead.";
9496 return -EINVAL;
9497 }
9498
9499 if (uuid.is_zero()) {
9500 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9501 }
9502
9503 return validate_osd_create(id, uuid, true, existing_id, ss);
9504 }
9505
// Handle `osd new`: create a brand-new OSD or recreate a previously
// destroyed one, optionally registering cephx/lockbox secrets and a
// dm-crypt key.
//
// Returns 0 on success, POSITIVE EEXIST for a fully idempotent replay
// (nothing to change), or a negative errno on error. On success/idempotent
// return, the chosen osd id is written to `f` (json) or `ss` (plain).
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the "id already exists" check when recreating a destroyed osd
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // _allocate_osd_id found a reusable hole instead of a fresh id
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the pair was supplied -- they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    // NOTE(review): new_state bits appear to act as a toggle mask here --
    // setting DESTROYED/UP below clears those flags (see the UP comment);
    // verify against OSDMap::Incremental semantics before relying on this.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9769
9770 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9771 {
9772 op->mark_osdmon_event(__func__);
9773 auto m = op->get_req<MMonCommand>();
9774 stringstream ss;
9775 cmdmap_t cmdmap;
9776 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9777 string rs = ss.str();
9778 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9779 return true;
9780 }
9781
9782 MonSession *session = op->get_session();
9783 if (!session) {
9784 derr << __func__ << " no session" << dendl;
9785 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9786 return true;
9787 }
9788
9789 return prepare_command_impl(op, cmdmap);
9790 }
9791
9792 static int parse_reweights(CephContext *cct,
9793 const cmdmap_t& cmdmap,
9794 const OSDMap& osdmap,
9795 map<int32_t, uint32_t>* weights)
9796 {
9797 string weights_str;
9798 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9799 return -EINVAL;
9800 }
9801 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9802 json_spirit::mValue json_value;
9803 if (!json_spirit::read(weights_str, json_value)) {
9804 return -EINVAL;
9805 }
9806 if (json_value.type() != json_spirit::obj_type) {
9807 return -EINVAL;
9808 }
9809 const auto obj = json_value.get_obj();
9810 try {
9811 for (auto& osd_weight : obj) {
9812 auto osd_id = std::stoi(osd_weight.first);
9813 if (!osdmap.exists(osd_id)) {
9814 return -ENOENT;
9815 }
9816 if (osd_weight.second.type() != json_spirit::str_type) {
9817 return -EINVAL;
9818 }
9819 auto weight = std::stoul(osd_weight.second.get_str());
9820 weights->insert({osd_id, weight});
9821 }
9822 } catch (const std::logic_error& e) {
9823 return -EINVAL;
9824 }
9825 return 0;
9826 }
9827
// Stage the destruction of osd.<id> into pending_inc: wipe its cephx /
// lockbox auth entities and its config-key store entries, then mark it
// DESTROYED with a zeroed uuid. Does NOT propose — the caller (e.g.
// `osd purge`) is responsible for proposing the pending map.
//
// Returns 0 on success (including the idempotent already-destroyed
// case), -ENOENT if the osd does not exist, or a negative error from
// auth validation. Requires paxos to be plugged by the caller.
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  // the caller must have plugged paxos so our staged changes and theirs
  // are proposed as a single transaction.
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // "idempotent" here means the corresponding state is already gone, so
  // the matching do_osd_destroy() below can be skipped.
  bool idempotent_auth = false, idempotent_cks = false;

  // validate (but do not yet apply) removal of the osd's auth entities;
  // -ENOENT means they were already removed, which is fine.
  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  // same dance for the config-key (kv) entries; only -ENOENT is an
  // acceptable failure here.
  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  // both validations passed — from here on every step must succeed.
  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: mark destroyed and clear the uuid so the
  // id can be reused by a future `osd new`.
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9899
// Stage a full purge of osd.<id>: remove it from crush, destroy its
// auth/config-key state (via prepare_command_osd_destroy), and remove
// it from the osdmap. Does NOT propose; requires paxos to be plugged
// and the osd to be down.
//
// Returns 0 on success, -ENOENT if the osd was already fully purged
// (idempotent replay), or a negative error if the crush removal or the
// destroy validation fails before any update is staged.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // caller must have verified this; purging an up osd would corrupt state.
  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  // tracks whether this call might be a no-op replay of a previous purge
  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal (last arg false = don't apply yet).
  // -ENOENT means the osd is already gone from crush.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: destroy auth/config-key state and mark DESTROYED.
  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      // destroy actually did work, so this cannot be a pure replay
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // nothing was left to do anywhere and the osd is gone: report replay.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush removal we validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9967
// Extract and validate the "pgid" argument from cmdmap. On success,
// pgid holds the parsed value and 0 is returned. On failure, ss holds a
// human-readable reason and -EINVAL (unparseable) or -ENOENT (pg does
// not exist in the committed osdmap) is returned.
//
// NOTE(review): `pgids` is taken *by value*, so the assignment at the
// bottom mutates a local copy and is never visible to the caller. If a
// caller relies on receiving the raw pgid string, this parameter likely
// needs to become a pointer/reference — confirm against call sites.
int OSDMonitor::parse_pgid(const cmdmap_t& cmdmap, stringstream &ss,
                           /* out */ pg_t &pgid, std::optional<string> pgids) {
  string pgidstr;
  if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    return -EINVAL;
  }
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    return -EINVAL;
  }
  // validate against the committed map, not pending state
  if (!osdmap.pg_exists(pgid)) {
    ss << "pgid '" << pgid << "' does not exist";
    return -ENOENT;
  }
  if (pgids.has_value())
    pgids.value() = pgidstr;  // see NOTE(review) above: no effect for caller
  return 0;
}
9988
9989 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9990 const cmdmap_t& cmdmap)
9991 {
9992 op->mark_osdmon_event(__func__);
9993 auto m = op->get_req<MMonCommand>();
9994 bool ret = false;
9995 stringstream ss;
9996 string rs;
9997 bufferlist rdata;
9998 int err = 0;
9999
10000 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
10001 boost::scoped_ptr<Formatter> f(Formatter::create(format));
10002
10003 string prefix;
10004 cmd_getval(cmdmap, "prefix", prefix);
10005
10006 int64_t osdid;
10007 string osd_name;
10008 bool osdid_present = false;
10009 if (prefix != "osd pg-temp" &&
10010 prefix != "osd pg-upmap" &&
10011 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
10012 osdid_present = cmd_getval(cmdmap, "id", osdid);
10013 }
10014 if (osdid_present) {
10015 ostringstream oss;
10016 oss << "osd." << osdid;
10017 osd_name = oss.str();
10018 }
10019
10020 // Even if there's a pending state with changes that could affect
10021 // a command, considering that said state isn't yet committed, we
10022 // just don't care about those changes if the command currently being
10023 // handled acts as a no-op against the current committed state.
10024 // In a nutshell, we assume this command happens *before*.
10025 //
10026 // Let me make this clearer:
10027 //
10028 // - If we have only one client, and that client issues some
10029 // operation that would conflict with this operation but is
10030 // still on the pending state, then we would be sure that said
10031 // operation wouldn't have returned yet, so the client wouldn't
10032 // issue this operation (unless the client didn't wait for the
10033 // operation to finish, and that would be the client's own fault).
10034 //
10035 // - If we have more than one client, each client will observe
10036 // whatever is the state at the moment of the commit. So, if we
10037 // have two clients, one issuing an unlink and another issuing a
10038 // link, and if the link happens while the unlink is still on the
10039 // pending state, from the link's point-of-view this is a no-op.
10040 // If different clients are issuing conflicting operations and
10041 // they care about that, then the clients should make sure they
10042 // enforce some kind of concurrency mechanism -- from our
10043 // perspective that's what Douglas Adams would call an SEP.
10044 //
10045 // This should be used as a general guideline for most commands handled
10046 // in this function. Adapt as you see fit, but please bear in mind that
10047 // this is the expected behavior.
10048
10049
10050 if (prefix == "osd setcrushmap" ||
10051 (prefix == "osd crush set" && !osdid_present)) {
10052 if (pending_inc.crush.length()) {
10053 dout(10) << __func__ << " waiting for pending crush update " << dendl;
10054 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
10055 return true;
10056 }
10057 dout(10) << "prepare_command setting new crush map" << dendl;
10058 bufferlist data(m->get_data());
10059 CrushWrapper crush;
10060 try {
10061 auto bl = data.cbegin();
10062 crush.decode(bl);
10063 }
10064 catch (const std::exception &e) {
10065 err = -EINVAL;
10066 ss << "Failed to parse crushmap: " << e.what();
10067 goto reply;
10068 }
10069
10070 int64_t prior_version = 0;
10071 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
10072 if (prior_version == osdmap.get_crush_version() - 1) {
10073 // see if we are a resend of the last update. this is imperfect
10074 // (multiple racing updaters may not both get reliable success)
10075 // but we expect crush updaters (via this interface) to be rare-ish.
10076 bufferlist current, proposed;
10077 osdmap.crush->encode(current, mon.get_quorum_con_features());
10078 crush.encode(proposed, mon.get_quorum_con_features());
10079 if (current.contents_equal(proposed)) {
10080 dout(10) << __func__
10081 << " proposed matches current and version equals previous"
10082 << dendl;
10083 err = 0;
10084 ss << osdmap.get_crush_version();
10085 goto reply;
10086 }
10087 }
10088 if (prior_version != osdmap.get_crush_version()) {
10089 err = -EPERM;
10090 ss << "prior_version " << prior_version << " != crush version "
10091 << osdmap.get_crush_version();
10092 goto reply;
10093 }
10094 }
10095
10096 if (!validate_crush_against_features(&crush, ss)) {
10097 err = -EINVAL;
10098 goto reply;
10099 }
10100
10101 err = osdmap.validate_crush_rules(&crush, &ss);
10102 if (err < 0) {
10103 goto reply;
10104 }
10105
10106 if (g_conf()->mon_osd_crush_smoke_test) {
10107 // sanity check: test some inputs to make sure this map isn't
10108 // totally broken
10109 dout(10) << " testing map" << dendl;
10110 stringstream ess;
10111 CrushTester tester(crush, ess);
10112 tester.set_min_x(0);
10113 tester.set_max_x(50);
10114 tester.set_num_rep(3); // arbitrary
10115 auto start = ceph::coarse_mono_clock::now();
10116 int r = tester.test_with_fork(cct, g_conf()->mon_lease);
10117 auto duration = ceph::coarse_mono_clock::now() - start;
10118 if (r < 0) {
10119 dout(10) << " tester.test_with_fork returns " << r
10120 << ": " << ess.str() << dendl;
10121 ss << "crush smoke test failed with " << r << ": " << ess.str();
10122 err = r;
10123 goto reply;
10124 }
10125 dout(10) << __func__ << " crush somke test duration: "
10126 << duration << ", result: " << ess.str() << dendl;
10127 }
10128
10129 pending_inc.crush = data;
10130 ss << osdmap.get_crush_version() + 1;
10131 goto update;
10132
10133 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10134 CrushWrapper newcrush = _get_pending_crush();
10135 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10136 int bid = -1 - b;
10137 if (newcrush.bucket_exists(bid) &&
10138 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10139 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10140 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10141 }
10142 }
10143 if (!validate_crush_against_features(&newcrush, ss)) {
10144 err = -EINVAL;
10145 goto reply;
10146 }
10147 pending_inc.crush.clear();
10148 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10149 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10150 get_last_committed() + 1));
10151 return true;
10152 } else if (prefix == "osd crush set-device-class") {
10153 string device_class;
10154 if (!cmd_getval(cmdmap, "class", device_class)) {
10155 err = -EINVAL; // no value!
10156 goto reply;
10157 }
10158
10159 bool stop = false;
10160 vector<string> idvec;
10161 cmd_getval(cmdmap, "ids", idvec);
10162 CrushWrapper newcrush = _get_pending_crush();
10163 set<int> updated;
10164 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10165 set<int> osds;
10166 // wildcard?
10167 if (j == 0 &&
10168 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10169 osdmap.get_all_osds(osds);
10170 stop = true;
10171 } else {
10172 // try traditional single osd way
10173 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10174 if (osd < 0) {
10175 // ss has reason for failure
10176 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10177 err = -EINVAL;
10178 continue;
10179 }
10180 osds.insert(osd);
10181 }
10182
10183 for (auto &osd : osds) {
10184 if (!osdmap.exists(osd)) {
10185 ss << "osd." << osd << " does not exist. ";
10186 continue;
10187 }
10188
10189 ostringstream oss;
10190 oss << "osd." << osd;
10191 string name = oss.str();
10192
10193 if (newcrush.get_max_devices() < osd + 1) {
10194 newcrush.set_max_devices(osd + 1);
10195 }
10196 string action;
10197 if (newcrush.item_exists(osd)) {
10198 action = "updating";
10199 } else {
10200 action = "creating";
10201 newcrush.set_item_name(osd, name);
10202 }
10203
10204 dout(5) << action << " crush item id " << osd << " name '" << name
10205 << "' device_class '" << device_class << "'"
10206 << dendl;
10207 err = newcrush.update_device_class(osd, device_class, name, &ss);
10208 if (err < 0) {
10209 goto reply;
10210 }
10211 if (err == 0 && !_have_pending_crush()) {
10212 if (!stop) {
10213 // for single osd only, wildcard makes too much noise
10214 ss << "set-device-class item id " << osd << " name '" << name
10215 << "' device_class '" << device_class << "': no change. ";
10216 }
10217 } else {
10218 updated.insert(osd);
10219 }
10220 }
10221 }
10222
10223 pending_inc.crush.clear();
10224 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10225 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10226 getline(ss, rs);
10227 wait_for_finished_proposal(
10228 op,
10229 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10230 return true;
10231 } else if (prefix == "osd crush rm-device-class") {
10232 bool stop = false;
10233 vector<string> idvec;
10234 cmd_getval(cmdmap, "ids", idvec);
10235 CrushWrapper newcrush = _get_pending_crush();
10236 set<int> updated;
10237
10238 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10239 set<int> osds;
10240
10241 // wildcard?
10242 if (j == 0 &&
10243 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10244 osdmap.get_all_osds(osds);
10245 stop = true;
10246 } else {
10247 // try traditional single osd way
10248 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10249 if (osd < 0) {
10250 // ss has reason for failure
10251 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10252 err = -EINVAL;
10253 goto reply;
10254 }
10255 osds.insert(osd);
10256 }
10257
10258 for (auto &osd : osds) {
10259 if (!osdmap.exists(osd)) {
10260 ss << "osd." << osd << " does not exist. ";
10261 continue;
10262 }
10263
10264 auto class_name = newcrush.get_item_class(osd);
10265 if (!class_name) {
10266 ss << "osd." << osd << " belongs to no class, ";
10267 continue;
10268 }
10269 // note that we do not verify if class_is_in_use here
10270 // in case the device is misclassified and user wants
10271 // to overridely reset...
10272
10273 err = newcrush.remove_device_class(cct, osd, &ss);
10274 if (err < 0) {
10275 // ss has reason for failure
10276 goto reply;
10277 }
10278 updated.insert(osd);
10279 }
10280 }
10281
10282 pending_inc.crush.clear();
10283 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10284 ss << "done removing class of osd(s): " << updated;
10285 getline(ss, rs);
10286 wait_for_finished_proposal(
10287 op,
10288 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10289 return true;
10290 } else if (prefix == "osd crush class create") {
10291 string device_class;
10292 if (!cmd_getval(cmdmap, "class", device_class)) {
10293 err = -EINVAL; // no value!
10294 goto reply;
10295 }
10296 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10297 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10298 << "luminous' before using crush device classes";
10299 err = -EPERM;
10300 goto reply;
10301 }
10302 if (!_have_pending_crush() &&
10303 _get_stable_crush().class_exists(device_class)) {
10304 ss << "class '" << device_class << "' already exists";
10305 goto reply;
10306 }
10307 CrushWrapper newcrush = _get_pending_crush();
10308 if (newcrush.class_exists(device_class)) {
10309 ss << "class '" << device_class << "' already exists";
10310 goto update;
10311 }
10312 int class_id = newcrush.get_or_create_class_id(device_class);
10313 pending_inc.crush.clear();
10314 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10315 ss << "created class " << device_class << " with id " << class_id
10316 << " to crush map";
10317 goto update;
10318 } else if (prefix == "osd crush class rm") {
10319 string device_class;
10320 if (!cmd_getval(cmdmap, "class", device_class)) {
10321 err = -EINVAL; // no value!
10322 goto reply;
10323 }
10324 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10325 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10326 << "luminous' before using crush device classes";
10327 err = -EPERM;
10328 goto reply;
10329 }
10330
10331 if (!osdmap.crush->class_exists(device_class)) {
10332 err = 0;
10333 goto reply;
10334 }
10335
10336 CrushWrapper newcrush = _get_pending_crush();
10337 if (!newcrush.class_exists(device_class)) {
10338 err = 0; // make command idempotent
10339 goto wait;
10340 }
10341 int class_id = newcrush.get_class_id(device_class);
10342 stringstream ts;
10343 if (newcrush.class_is_in_use(class_id, &ts)) {
10344 err = -EBUSY;
10345 ss << "class '" << device_class << "' " << ts.str();
10346 goto reply;
10347 }
10348
10349 // check if class is used by any erasure-code-profiles
10350 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10351 osdmap.get_erasure_code_profiles();
10352 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10353 #ifdef HAVE_STDLIB_MAP_SPLICING
10354 ec_profiles.merge(old_ec_profiles);
10355 #else
10356 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10357 make_move_iterator(end(old_ec_profiles)));
10358 #endif
10359 list<string> referenced_by;
10360 for (auto &i: ec_profiles) {
10361 for (auto &j: i.second) {
10362 if ("crush-device-class" == j.first && device_class == j.second) {
10363 referenced_by.push_back(i.first);
10364 }
10365 }
10366 }
10367 if (!referenced_by.empty()) {
10368 err = -EBUSY;
10369 ss << "class '" << device_class
10370 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10371 goto reply;
10372 }
10373
10374 set<int> osds;
10375 newcrush.get_devices_by_class(device_class, &osds);
10376 for (auto& p: osds) {
10377 err = newcrush.remove_device_class(cct, p, &ss);
10378 if (err < 0) {
10379 // ss has reason for failure
10380 goto reply;
10381 }
10382 }
10383
10384 if (osds.empty()) {
10385 // empty class, remove directly
10386 err = newcrush.remove_class_name(device_class);
10387 if (err < 0) {
10388 ss << "class '" << device_class << "' cannot be removed '"
10389 << cpp_strerror(err) << "'";
10390 goto reply;
10391 }
10392 }
10393
10394 pending_inc.crush.clear();
10395 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10396 ss << "removed class " << device_class << " with id " << class_id
10397 << " from crush map";
10398 goto update;
10399 } else if (prefix == "osd crush class rename") {
10400 string srcname, dstname;
10401 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10402 err = -EINVAL;
10403 goto reply;
10404 }
10405 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10406 err = -EINVAL;
10407 goto reply;
10408 }
10409
10410 CrushWrapper newcrush = _get_pending_crush();
10411 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10412 // suppose this is a replay and return success
10413 // so command is idempotent
10414 ss << "already renamed to '" << dstname << "'";
10415 err = 0;
10416 goto reply;
10417 }
10418
10419 err = newcrush.rename_class(srcname, dstname);
10420 if (err < 0) {
10421 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10422 << cpp_strerror(err);
10423 goto reply;
10424 }
10425
10426 pending_inc.crush.clear();
10427 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10428 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10429 goto update;
10430 } else if (prefix == "osd crush add-bucket") {
10431 // os crush add-bucket <name> <type>
10432 string name, typestr;
10433 vector<string> argvec;
10434 cmd_getval(cmdmap, "name", name);
10435 cmd_getval(cmdmap, "type", typestr);
10436 cmd_getval(cmdmap, "args", argvec);
10437 map<string,string> loc;
10438 if (!argvec.empty()) {
10439 CrushWrapper::parse_loc_map(argvec, &loc);
10440 dout(0) << "will create and move bucket '" << name
10441 << "' to location " << loc << dendl;
10442 }
10443
10444 if (!_have_pending_crush() &&
10445 _get_stable_crush().name_exists(name)) {
10446 ss << "bucket '" << name << "' already exists";
10447 goto reply;
10448 }
10449
10450 CrushWrapper newcrush = _get_pending_crush();
10451
10452 if (newcrush.name_exists(name)) {
10453 ss << "bucket '" << name << "' already exists";
10454 goto update;
10455 }
10456 int type = newcrush.get_type_id(typestr);
10457 if (type < 0) {
10458 ss << "type '" << typestr << "' does not exist";
10459 err = -EINVAL;
10460 goto reply;
10461 }
10462 if (type == 0) {
10463 ss << "type '" << typestr << "' is for devices, not buckets";
10464 err = -EINVAL;
10465 goto reply;
10466 }
10467 int bucketno;
10468 err = newcrush.add_bucket(0, 0,
10469 CRUSH_HASH_DEFAULT, type, 0, NULL,
10470 NULL, &bucketno);
10471 if (err < 0) {
10472 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10473 goto reply;
10474 }
10475 err = newcrush.set_item_name(bucketno, name);
10476 if (err < 0) {
10477 ss << "error setting bucket name to '" << name << "'";
10478 goto reply;
10479 }
10480
10481 if (!loc.empty()) {
10482 if (!newcrush.check_item_loc(cct, bucketno, loc,
10483 (int *)NULL)) {
10484 err = newcrush.move_bucket(cct, bucketno, loc);
10485 if (err < 0) {
10486 ss << "error moving bucket '" << name << "' to location " << loc;
10487 goto reply;
10488 }
10489 } else {
10490 ss << "no need to move item id " << bucketno << " name '" << name
10491 << "' to location " << loc << " in crush map";
10492 }
10493 }
10494
10495 pending_inc.crush.clear();
10496 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10497 if (loc.empty()) {
10498 ss << "added bucket " << name << " type " << typestr
10499 << " to crush map";
10500 } else {
10501 ss << "added bucket " << name << " type " << typestr
10502 << " to location " << loc;
10503 }
10504 goto update;
10505 } else if (prefix == "osd crush rename-bucket") {
10506 string srcname, dstname;
10507 cmd_getval(cmdmap, "srcname", srcname);
10508 cmd_getval(cmdmap, "dstname", dstname);
10509
10510 err = crush_rename_bucket(srcname, dstname, &ss);
10511 if (err == -EALREADY) // equivalent to success for idempotency
10512 err = 0;
10513 if (err)
10514 goto reply;
10515 else
10516 goto update;
10517 } else if (prefix == "osd crush weight-set create" ||
10518 prefix == "osd crush weight-set create-compat") {
10519 if (_have_pending_crush()) {
10520 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10521 goto wait;
10522 }
10523 CrushWrapper newcrush = _get_pending_crush();
10524 int64_t pool;
10525 int positions;
10526 if (newcrush.has_non_straw2_buckets()) {
10527 ss << "crush map contains one or more bucket(s) that are not straw2";
10528 err = -EPERM;
10529 goto reply;
10530 }
10531 if (prefix == "osd crush weight-set create") {
10532 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10533 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10534 ss << "require_min_compat_client "
10535 << osdmap.require_min_compat_client
10536 << " < luminous, which is required for per-pool weight-sets. "
10537 << "Try 'ceph osd set-require-min-compat-client luminous' "
10538 << "before using the new interface";
10539 err = -EPERM;
10540 goto reply;
10541 }
10542 string poolname, mode;
10543 cmd_getval(cmdmap, "pool", poolname);
10544 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10545 if (pool < 0) {
10546 ss << "pool '" << poolname << "' not found";
10547 err = -ENOENT;
10548 goto reply;
10549 }
10550 cmd_getval(cmdmap, "mode", mode);
10551 if (mode != "flat" && mode != "positional") {
10552 ss << "unrecognized weight-set mode '" << mode << "'";
10553 err = -EINVAL;
10554 goto reply;
10555 }
10556 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10557 } else {
10558 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10559 positions = 1;
10560 }
10561 if (!newcrush.create_choose_args(pool, positions)) {
10562 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10563 ss << "compat weight-set already created";
10564 } else {
10565 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10566 << "' already created";
10567 }
10568 goto reply;
10569 }
10570 pending_inc.crush.clear();
10571 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10572 goto update;
10573
10574 } else if (prefix == "osd crush weight-set rm" ||
10575 prefix == "osd crush weight-set rm-compat") {
10576 CrushWrapper newcrush = _get_pending_crush();
10577 int64_t pool;
10578 if (prefix == "osd crush weight-set rm") {
10579 string poolname;
10580 cmd_getval(cmdmap, "pool", poolname);
10581 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10582 if (pool < 0) {
10583 ss << "pool '" << poolname << "' not found";
10584 err = -ENOENT;
10585 goto reply;
10586 }
10587 } else {
10588 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10589 }
10590 newcrush.rm_choose_args(pool);
10591 pending_inc.crush.clear();
10592 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10593 goto update;
10594
10595 } else if (prefix == "osd crush weight-set reweight" ||
10596 prefix == "osd crush weight-set reweight-compat") {
10597 string poolname, item;
10598 vector<double> weight;
10599 cmd_getval(cmdmap, "pool", poolname);
10600 cmd_getval(cmdmap, "item", item);
10601 cmd_getval(cmdmap, "weight", weight);
10602 CrushWrapper newcrush = _get_pending_crush();
10603 int64_t pool;
10604 if (prefix == "osd crush weight-set reweight") {
10605 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10606 if (pool < 0) {
10607 ss << "pool '" << poolname << "' not found";
10608 err = -ENOENT;
10609 goto reply;
10610 }
10611 if (!newcrush.have_choose_args(pool)) {
10612 ss << "no weight-set for pool '" << poolname << "'";
10613 err = -ENOENT;
10614 goto reply;
10615 }
10616 auto arg_map = newcrush.choose_args_get(pool);
10617 int positions = newcrush.get_choose_args_positions(arg_map);
10618 if (weight.size() != (size_t)positions) {
10619 ss << "must specify exact " << positions << " weight values";
10620 err = -EINVAL;
10621 goto reply;
10622 }
10623 } else {
10624 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10625 if (!newcrush.have_choose_args(pool)) {
10626 ss << "no backward-compatible weight-set";
10627 err = -ENOENT;
10628 goto reply;
10629 }
10630 }
10631 if (!newcrush.name_exists(item)) {
10632 ss << "item '" << item << "' does not exist";
10633 err = -ENOENT;
10634 goto reply;
10635 }
10636 err = newcrush.choose_args_adjust_item_weightf(
10637 cct,
10638 newcrush.choose_args_get(pool),
10639 newcrush.get_item_id(item),
10640 weight,
10641 &ss);
10642 if (err < 0) {
10643 goto reply;
10644 }
10645 err = 0;
10646 pending_inc.crush.clear();
10647 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10648 goto update;
10649 } else if (osdid_present &&
10650 (prefix == "osd crush set" || prefix == "osd crush add")) {
10651 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10652 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10653 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10654
10655 if (!osdmap.exists(osdid)) {
10656 err = -ENOENT;
10657 ss << osd_name
10658 << " does not exist. Create it before updating the crush map";
10659 goto reply;
10660 }
10661
10662 double weight;
10663 if (!cmd_getval(cmdmap, "weight", weight)) {
10664 ss << "unable to parse weight value '"
10665 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10666 err = -EINVAL;
10667 goto reply;
10668 }
10669
10670 string args;
10671 vector<string> argvec;
10672 cmd_getval(cmdmap, "args", argvec);
10673 map<string,string> loc;
10674 CrushWrapper::parse_loc_map(argvec, &loc);
10675
10676 if (prefix == "osd crush set"
10677 && !_get_stable_crush().item_exists(osdid)) {
10678 err = -ENOENT;
10679 ss << "unable to set item id " << osdid << " name '" << osd_name
10680 << "' weight " << weight << " at location " << loc
10681 << ": does not exist";
10682 goto reply;
10683 }
10684
10685 dout(5) << "adding/updating crush item id " << osdid << " name '"
10686 << osd_name << "' weight " << weight << " at location "
10687 << loc << dendl;
10688 CrushWrapper newcrush = _get_pending_crush();
10689
10690 string action;
10691 if (prefix == "osd crush set" ||
10692 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10693 action = "set";
10694 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10695 } else {
10696 action = "add";
10697 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10698 if (err == 0)
10699 err = 1;
10700 }
10701
10702 if (err < 0)
10703 goto reply;
10704
10705 if (err == 0 && !_have_pending_crush()) {
10706 ss << action << " item id " << osdid << " name '" << osd_name
10707 << "' weight " << weight << " at location " << loc << ": no change";
10708 goto reply;
10709 }
10710
10711 pending_inc.crush.clear();
10712 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10713 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10714 << weight << " at location " << loc << " to crush map";
10715 getline(ss, rs);
10716 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10717 get_last_committed() + 1));
10718 return true;
10719
10720 } else if (prefix == "osd crush create-or-move") {
10721 do {
10722 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10723 if (!osdmap.exists(osdid)) {
10724 err = -ENOENT;
10725 ss << osd_name
10726 << " does not exist. create it before updating the crush map";
10727 goto reply;
10728 }
10729
10730 double weight;
10731 if (!cmd_getval(cmdmap, "weight", weight)) {
10732 ss << "unable to parse weight value '"
10733 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10734 err = -EINVAL;
10735 goto reply;
10736 }
10737
10738 string args;
10739 vector<string> argvec;
10740 cmd_getval(cmdmap, "args", argvec);
10741 map<string,string> loc;
10742 CrushWrapper::parse_loc_map(argvec, &loc);
10743
10744 dout(0) << "create-or-move crush item name '" << osd_name
10745 << "' initial_weight " << weight << " at location " << loc
10746 << dendl;
10747
10748 CrushWrapper newcrush = _get_pending_crush();
10749
10750 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10751 g_conf()->osd_crush_update_weight_set);
10752 if (err == 0) {
10753 ss << "create-or-move updated item name '" << osd_name
10754 << "' weight " << weight
10755 << " at location " << loc << " to crush map";
10756 break;
10757 }
10758 if (err > 0) {
10759 pending_inc.crush.clear();
10760 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10761 ss << "create-or-move updating item name '" << osd_name
10762 << "' weight " << weight
10763 << " at location " << loc << " to crush map";
10764 getline(ss, rs);
10765 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10766 get_last_committed() + 1));
10767 return true;
10768 }
10769 } while (false);
10770
10771 } else if (prefix == "osd crush move") {
10772 do {
10773 // osd crush move <name> <loc1> [<loc2> ...]
10774 string name;
10775 vector<string> argvec;
10776 cmd_getval(cmdmap, "name", name);
10777 cmd_getval(cmdmap, "args", argvec);
10778 map<string,string> loc;
10779 CrushWrapper::parse_loc_map(argvec, &loc);
10780
10781 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10782 CrushWrapper newcrush = _get_pending_crush();
10783
10784 if (!newcrush.name_exists(name)) {
10785 err = -ENOENT;
10786 ss << "item " << name << " does not exist";
10787 break;
10788 }
10789 int id = newcrush.get_item_id(name);
10790
10791 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10792 if (id >= 0) {
10793 err = newcrush.create_or_move_item(
10794 cct, id, 0, name, loc,
10795 g_conf()->osd_crush_update_weight_set);
10796 } else {
10797 err = newcrush.move_bucket(cct, id, loc);
10798 }
10799 if (err >= 0) {
10800 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10801 pending_inc.crush.clear();
10802 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10803 getline(ss, rs);
10804 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10805 get_last_committed() + 1));
10806 return true;
10807 }
10808 } else {
10809 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10810 err = 0;
10811 }
10812 } while (false);
10813 } else if (prefix == "osd crush swap-bucket") {
10814 string source, dest;
10815 cmd_getval(cmdmap, "source", source);
10816 cmd_getval(cmdmap, "dest", dest);
10817
10818 bool force = false;
10819 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10820
10821 CrushWrapper newcrush = _get_pending_crush();
10822 if (!newcrush.name_exists(source)) {
10823 ss << "source item " << source << " does not exist";
10824 err = -ENOENT;
10825 goto reply;
10826 }
10827 if (!newcrush.name_exists(dest)) {
10828 ss << "dest item " << dest << " does not exist";
10829 err = -ENOENT;
10830 goto reply;
10831 }
10832 int sid = newcrush.get_item_id(source);
10833 int did = newcrush.get_item_id(dest);
10834 int sparent;
10835 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10836 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10837 err = -EPERM;
10838 goto reply;
10839 }
10840 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10841 !force) {
10842 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10843 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10844 << "; pass --yes-i-really-mean-it to proceed anyway";
10845 err = -EPERM;
10846 goto reply;
10847 }
10848 int r = newcrush.swap_bucket(cct, sid, did);
10849 if (r < 0) {
10850 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10851 err = r;
10852 goto reply;
10853 }
10854 ss << "swapped bucket of " << source << " to " << dest;
10855 pending_inc.crush.clear();
10856 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10857 wait_for_finished_proposal(op,
10858 new Monitor::C_Command(mon, op, err, ss.str(),
10859 get_last_committed() + 1));
10860 return true;
10861 } else if (prefix == "osd crush link") {
10862 // osd crush link <name> <loc1> [<loc2> ...]
10863 string name;
10864 cmd_getval(cmdmap, "name", name);
10865 vector<string> argvec;
10866 cmd_getval(cmdmap, "args", argvec);
10867 map<string,string> loc;
10868 CrushWrapper::parse_loc_map(argvec, &loc);
10869
10870 // Need an explicit check for name_exists because get_item_id returns
10871 // 0 on unfound.
10872 int id = osdmap.crush->get_item_id(name);
10873 if (!osdmap.crush->name_exists(name)) {
10874 err = -ENOENT;
10875 ss << "item " << name << " does not exist";
10876 goto reply;
10877 } else {
10878 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10879 }
10880 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10881 ss << "no need to move item id " << id << " name '" << name
10882 << "' to location " << loc << " in crush map";
10883 err = 0;
10884 goto reply;
10885 }
10886
10887 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10888 CrushWrapper newcrush = _get_pending_crush();
10889
10890 if (!newcrush.name_exists(name)) {
10891 err = -ENOENT;
10892 ss << "item " << name << " does not exist";
10893 goto reply;
10894 } else {
10895 int id = newcrush.get_item_id(name);
10896 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10897 err = newcrush.link_bucket(cct, id, loc);
10898 if (err >= 0) {
10899 ss << "linked item id " << id << " name '" << name
10900 << "' to location " << loc << " in crush map";
10901 pending_inc.crush.clear();
10902 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10903 } else {
10904 ss << "cannot link item id " << id << " name '" << name
10905 << "' to location " << loc;
10906 goto reply;
10907 }
10908 } else {
10909 ss << "no need to move item id " << id << " name '" << name
10910 << "' to location " << loc << " in crush map";
10911 err = 0;
10912 }
10913 }
10914 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10915 get_last_committed() + 1));
10916 return true;
10917 } else if (prefix == "osd crush rm" ||
10918 prefix == "osd crush remove" ||
10919 prefix == "osd crush unlink") {
10920 do {
10921 // osd crush rm <id> [ancestor]
10922 CrushWrapper newcrush = _get_pending_crush();
10923
10924 string name;
10925 cmd_getval(cmdmap, "name", name);
10926
10927 if (!osdmap.crush->name_exists(name)) {
10928 err = 0;
10929 ss << "device '" << name << "' does not appear in the crush map";
10930 break;
10931 }
10932 if (!newcrush.name_exists(name)) {
10933 err = 0;
10934 ss << "device '" << name << "' does not appear in the crush map";
10935 getline(ss, rs);
10936 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10937 get_last_committed() + 1));
10938 return true;
10939 }
10940 int id = newcrush.get_item_id(name);
10941 int ancestor = 0;
10942
10943 bool unlink_only = prefix == "osd crush unlink";
10944 string ancestor_str;
10945 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10946 if (!newcrush.name_exists(ancestor_str)) {
10947 err = -ENOENT;
10948 ss << "ancestor item '" << ancestor_str
10949 << "' does not appear in the crush map";
10950 break;
10951 }
10952 ancestor = newcrush.get_item_id(ancestor_str);
10953 }
10954
10955 err = prepare_command_osd_crush_remove(
10956 newcrush,
10957 id, ancestor,
10958 (ancestor < 0), unlink_only);
10959
10960 if (err == -ENOENT) {
10961 ss << "item " << id << " does not appear in that position";
10962 err = 0;
10963 break;
10964 }
10965 if (err == 0) {
10966 if (!unlink_only)
10967 pending_inc.new_crush_node_flags[id] = 0;
10968 ss << "removed item id " << id << " name '" << name << "' from crush map";
10969 getline(ss, rs);
10970 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10971 get_last_committed() + 1));
10972 return true;
10973 }
10974 } while (false);
10975
10976 } else if (prefix == "osd crush reweight-all") {
10977 CrushWrapper newcrush = _get_pending_crush();
10978
10979 newcrush.reweight(cct);
10980 pending_inc.crush.clear();
10981 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10982 ss << "reweighted crush hierarchy";
10983 getline(ss, rs);
10984 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10985 get_last_committed() + 1));
10986 return true;
10987 } else if (prefix == "osd crush reweight") {
10988 // osd crush reweight <name> <weight>
10989 CrushWrapper newcrush = _get_pending_crush();
10990
10991 string name;
10992 cmd_getval(cmdmap, "name", name);
10993 if (!newcrush.name_exists(name)) {
10994 err = -ENOENT;
10995 ss << "device '" << name << "' does not appear in the crush map";
10996 goto reply;
10997 }
10998
10999 int id = newcrush.get_item_id(name);
11000 if (id < 0) {
11001 ss << "device '" << name << "' is not a leaf in the crush map";
11002 err = -EINVAL;
11003 goto reply;
11004 }
11005 double w;
11006 if (!cmd_getval(cmdmap, "weight", w)) {
11007 ss << "unable to parse weight value '"
11008 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11009 err = -EINVAL;
11010 goto reply;
11011 }
11012
11013 err = newcrush.adjust_item_weightf(cct, id, w,
11014 g_conf()->osd_crush_update_weight_set);
11015 if (err < 0)
11016 goto reply;
11017 pending_inc.crush.clear();
11018 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11019 ss << "reweighted item id " << id << " name '" << name << "' to " << w
11020 << " in crush map";
11021 getline(ss, rs);
11022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11023 get_last_committed() + 1));
11024 return true;
11025 } else if (prefix == "osd crush reweight-subtree") {
11026 // osd crush reweight <name> <weight>
11027 CrushWrapper newcrush = _get_pending_crush();
11028
11029 string name;
11030 cmd_getval(cmdmap, "name", name);
11031 if (!newcrush.name_exists(name)) {
11032 err = -ENOENT;
11033 ss << "device '" << name << "' does not appear in the crush map";
11034 goto reply;
11035 }
11036
11037 int id = newcrush.get_item_id(name);
11038 if (id >= 0) {
11039 ss << "device '" << name << "' is not a subtree in the crush map";
11040 err = -EINVAL;
11041 goto reply;
11042 }
11043 double w;
11044 if (!cmd_getval(cmdmap, "weight", w)) {
11045 ss << "unable to parse weight value '"
11046 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11047 err = -EINVAL;
11048 goto reply;
11049 }
11050
11051 err = newcrush.adjust_subtree_weightf(cct, id, w,
11052 g_conf()->osd_crush_update_weight_set);
11053 if (err < 0)
11054 goto reply;
11055 pending_inc.crush.clear();
11056 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11057 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
11058 << " in crush map";
11059 getline(ss, rs);
11060 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11061 get_last_committed() + 1));
11062 return true;
11063 } else if (prefix == "osd crush tunables") {
11064 CrushWrapper newcrush = _get_pending_crush();
11065
11066 err = 0;
11067 string profile;
11068 cmd_getval(cmdmap, "profile", profile);
11069 if (profile == "legacy" || profile == "argonaut") {
11070 newcrush.set_tunables_legacy();
11071 } else if (profile == "bobtail") {
11072 newcrush.set_tunables_bobtail();
11073 } else if (profile == "firefly") {
11074 newcrush.set_tunables_firefly();
11075 } else if (profile == "hammer") {
11076 newcrush.set_tunables_hammer();
11077 } else if (profile == "jewel") {
11078 newcrush.set_tunables_jewel();
11079 } else if (profile == "optimal") {
11080 newcrush.set_tunables_optimal();
11081 } else if (profile == "default") {
11082 newcrush.set_tunables_default();
11083 } else {
11084 ss << "unrecognized profile '" << profile << "'";
11085 err = -EINVAL;
11086 goto reply;
11087 }
11088
11089 if (!validate_crush_against_features(&newcrush, ss)) {
11090 err = -EINVAL;
11091 goto reply;
11092 }
11093
11094 pending_inc.crush.clear();
11095 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11096 ss << "adjusted tunables profile to " << profile;
11097 getline(ss, rs);
11098 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11099 get_last_committed() + 1));
11100 return true;
11101 } else if (prefix == "osd crush set-tunable") {
11102 CrushWrapper newcrush = _get_pending_crush();
11103
11104 err = 0;
11105 string tunable;
11106 cmd_getval(cmdmap, "tunable", tunable);
11107
11108 int64_t value = -1;
11109 if (!cmd_getval(cmdmap, "value", value)) {
11110 err = -EINVAL;
11111 ss << "failed to parse integer value "
11112 << cmd_vartype_stringify(cmdmap.at("value"));
11113 goto reply;
11114 }
11115
11116 if (tunable == "straw_calc_version") {
11117 if (value != 0 && value != 1) {
11118 ss << "value must be 0 or 1; got " << value;
11119 err = -EINVAL;
11120 goto reply;
11121 }
11122 newcrush.set_straw_calc_version(value);
11123 } else {
11124 ss << "unrecognized tunable '" << tunable << "'";
11125 err = -EINVAL;
11126 goto reply;
11127 }
11128
11129 if (!validate_crush_against_features(&newcrush, ss)) {
11130 err = -EINVAL;
11131 goto reply;
11132 }
11133
11134 pending_inc.crush.clear();
11135 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11136 ss << "adjusted tunable " << tunable << " to " << value;
11137 getline(ss, rs);
11138 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11139 get_last_committed() + 1));
11140 return true;
11141
11142 } else if (prefix == "osd crush rule create-simple") {
11143 string name, root, type, mode;
11144 cmd_getval(cmdmap, "name", name);
11145 cmd_getval(cmdmap, "root", root);
11146 cmd_getval(cmdmap, "type", type);
11147 cmd_getval(cmdmap, "mode", mode);
11148 if (mode == "")
11149 mode = "firstn";
11150
11151 if (osdmap.crush->rule_exists(name)) {
11152 // The name is uniquely associated to a ruleid and the rule it contains
11153 // From the user point of view, the rule is more meaningfull.
11154 ss << "rule " << name << " already exists";
11155 err = 0;
11156 goto reply;
11157 }
11158
11159 CrushWrapper newcrush = _get_pending_crush();
11160
11161 if (newcrush.rule_exists(name)) {
11162 // The name is uniquely associated to a ruleid and the rule it contains
11163 // From the user point of view, the rule is more meaningfull.
11164 ss << "rule " << name << " already exists";
11165 err = 0;
11166 } else {
11167 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11168 pg_pool_t::TYPE_REPLICATED, &ss);
11169 if (ruleno < 0) {
11170 err = ruleno;
11171 goto reply;
11172 }
11173
11174 pending_inc.crush.clear();
11175 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11176 }
11177 getline(ss, rs);
11178 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11179 get_last_committed() + 1));
11180 return true;
11181
11182 } else if (prefix == "osd crush rule create-replicated") {
11183 string name, root, type, device_class;
11184 cmd_getval(cmdmap, "name", name);
11185 cmd_getval(cmdmap, "root", root);
11186 cmd_getval(cmdmap, "type", type);
11187 cmd_getval(cmdmap, "class", device_class);
11188
11189 if (osdmap.crush->rule_exists(name)) {
11190 // The name is uniquely associated to a ruleid and the rule it contains
11191 // From the user point of view, the rule is more meaningfull.
11192 ss << "rule " << name << " already exists";
11193 err = 0;
11194 goto reply;
11195 }
11196
11197 CrushWrapper newcrush = _get_pending_crush();
11198
11199 if (newcrush.rule_exists(name)) {
11200 // The name is uniquely associated to a ruleid and the rule it contains
11201 // From the user point of view, the rule is more meaningfull.
11202 ss << "rule " << name << " already exists";
11203 err = 0;
11204 } else {
11205 int ruleno = newcrush.add_simple_rule(
11206 name, root, type, device_class,
11207 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11208 if (ruleno < 0) {
11209 err = ruleno;
11210 goto reply;
11211 }
11212
11213 pending_inc.crush.clear();
11214 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11215 }
11216 getline(ss, rs);
11217 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11218 get_last_committed() + 1));
11219 return true;
11220
11221 } else if (prefix == "osd erasure-code-profile rm") {
11222 string name;
11223 cmd_getval(cmdmap, "name", name);
11224
11225 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11226 goto wait;
11227
11228 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11229 err = -EBUSY;
11230 goto reply;
11231 }
11232
11233 if (osdmap.has_erasure_code_profile(name) ||
11234 pending_inc.new_erasure_code_profiles.count(name)) {
11235 if (osdmap.has_erasure_code_profile(name)) {
11236 pending_inc.old_erasure_code_profiles.push_back(name);
11237 } else {
11238 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11239 pending_inc.new_erasure_code_profiles.erase(name);
11240 }
11241
11242 getline(ss, rs);
11243 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11244 get_last_committed() + 1));
11245 return true;
11246 } else {
11247 ss << "erasure-code-profile " << name << " does not exist";
11248 err = 0;
11249 goto reply;
11250 }
11251
11252 } else if (prefix == "osd erasure-code-profile set") {
11253 string name;
11254 cmd_getval(cmdmap, "name", name);
11255 vector<string> profile;
11256 cmd_getval(cmdmap, "profile", profile);
11257
11258 bool force = false;
11259 cmd_getval(cmdmap, "force", force);
11260
11261 map<string,string> profile_map;
11262 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11263 if (err)
11264 goto reply;
11265 if (auto found = profile_map.find("crush-failure-domain");
11266 found != profile_map.end()) {
11267 const auto& failure_domain = found->second;
11268 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11269 if (failure_domain_type < 0) {
11270 ss << "erasure-code-profile " << profile_map
11271 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11272 err = -EINVAL;
11273 goto reply;
11274 }
11275 }
11276
11277 if (profile_map.find("plugin") == profile_map.end()) {
11278 ss << "erasure-code-profile " << profile_map
11279 << " must contain a plugin entry" << std::endl;
11280 err = -EINVAL;
11281 goto reply;
11282 }
11283 string plugin = profile_map["plugin"];
11284
11285 if (pending_inc.has_erasure_code_profile(name)) {
11286 dout(20) << "erasure code profile " << name << " try again" << dendl;
11287 goto wait;
11288 } else {
11289 err = normalize_profile(name, profile_map, force, &ss);
11290 if (err)
11291 goto reply;
11292
11293 if (osdmap.has_erasure_code_profile(name)) {
11294 ErasureCodeProfile existing_profile_map =
11295 osdmap.get_erasure_code_profile(name);
11296 err = normalize_profile(name, existing_profile_map, force, &ss);
11297 if (err)
11298 goto reply;
11299
11300 if (existing_profile_map == profile_map) {
11301 err = 0;
11302 goto reply;
11303 }
11304 if (!force) {
11305 err = -EPERM;
11306 ss << "will not override erasure code profile " << name
11307 << " because the existing profile "
11308 << existing_profile_map
11309 << " is different from the proposed profile "
11310 << profile_map;
11311 goto reply;
11312 }
11313 }
11314
11315 dout(20) << "erasure code profile set " << name << "="
11316 << profile_map << dendl;
11317 pending_inc.set_erasure_code_profile(name, profile_map);
11318 }
11319
11320 getline(ss, rs);
11321 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11322 get_last_committed() + 1));
11323 return true;
11324
11325 } else if (prefix == "osd crush rule create-erasure") {
11326 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11327 if (err == -EAGAIN)
11328 goto wait;
11329 if (err)
11330 goto reply;
11331 string name, poolstr;
11332 cmd_getval(cmdmap, "name", name);
11333 string profile;
11334 cmd_getval(cmdmap, "profile", profile);
11335 if (profile == "")
11336 profile = "default";
11337 if (profile == "default") {
11338 if (!osdmap.has_erasure_code_profile(profile)) {
11339 if (pending_inc.has_erasure_code_profile(profile)) {
11340 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11341 goto wait;
11342 }
11343
11344 map<string,string> profile_map;
11345 err = osdmap.get_erasure_code_profile_default(cct,
11346 profile_map,
11347 &ss);
11348 if (err)
11349 goto reply;
11350 err = normalize_profile(name, profile_map, true, &ss);
11351 if (err)
11352 goto reply;
11353 dout(20) << "erasure code profile set " << profile << "="
11354 << profile_map << dendl;
11355 pending_inc.set_erasure_code_profile(profile, profile_map);
11356 goto wait;
11357 }
11358 }
11359
11360 int rule;
11361 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11362 if (err < 0) {
11363 switch(err) {
11364 case -EEXIST: // return immediately
11365 ss << "rule " << name << " already exists";
11366 err = 0;
11367 goto reply;
11368 break;
11369 case -EALREADY: // wait for pending to be proposed
11370 ss << "rule " << name << " already exists";
11371 err = 0;
11372 break;
11373 default: // non recoverable error
11374 goto reply;
11375 break;
11376 }
11377 } else {
11378 ss << "created rule " << name << " at " << rule;
11379 }
11380
11381 getline(ss, rs);
11382 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11383 get_last_committed() + 1));
11384 return true;
11385
11386 } else if (prefix == "osd crush rule rm") {
11387 string name;
11388 cmd_getval(cmdmap, "name", name);
11389
11390 if (!osdmap.crush->rule_exists(name)) {
11391 ss << "rule " << name << " does not exist";
11392 err = 0;
11393 goto reply;
11394 }
11395
11396 CrushWrapper newcrush = _get_pending_crush();
11397
11398 if (!newcrush.rule_exists(name)) {
11399 ss << "rule " << name << " does not exist";
11400 err = 0;
11401 } else {
11402 int ruleno = newcrush.get_rule_id(name);
11403 ceph_assert(ruleno >= 0);
11404
11405 // make sure it is not in use.
11406 // FIXME: this is ok in some situations, but let's not bother with that
11407 // complexity now.
11408 if (osdmap.crush_rule_in_use(ruleno)) {
11409 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11410 err = -EBUSY;
11411 goto reply;
11412 }
11413
11414 err = newcrush.remove_rule(ruleno);
11415 if (err < 0) {
11416 goto reply;
11417 }
11418
11419 pending_inc.crush.clear();
11420 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11421 }
11422 getline(ss, rs);
11423 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11424 get_last_committed() + 1));
11425 return true;
11426
11427 } else if (prefix == "osd crush rule rename") {
11428 string srcname;
11429 string dstname;
11430 cmd_getval(cmdmap, "srcname", srcname);
11431 cmd_getval(cmdmap, "dstname", dstname);
11432 if (srcname.empty() || dstname.empty()) {
11433 ss << "must specify both source rule name and destination rule name";
11434 err = -EINVAL;
11435 goto reply;
11436 }
11437 if (srcname == dstname) {
11438 ss << "destination rule name is equal to source rule name";
11439 err = 0;
11440 goto reply;
11441 }
11442
11443 CrushWrapper newcrush = _get_pending_crush();
11444 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11445 // srcname does not exist and dstname already exists
11446 // suppose this is a replay and return success
11447 // (so this command is idempotent)
11448 ss << "already renamed to '" << dstname << "'";
11449 err = 0;
11450 goto reply;
11451 }
11452
11453 err = newcrush.rename_rule(srcname, dstname, &ss);
11454 if (err < 0) {
11455 // ss has reason for failure
11456 goto reply;
11457 }
11458 pending_inc.crush.clear();
11459 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11460 getline(ss, rs);
11461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11462 get_last_committed() + 1));
11463 return true;
11464
11465 } else if (prefix == "osd setmaxosd") {
11466 int64_t newmax;
11467 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11468 ss << "unable to parse 'newmax' value '"
11469 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11470 err = -EINVAL;
11471 goto reply;
11472 }
11473
11474 if (newmax > g_conf()->mon_max_osd) {
11475 err = -ERANGE;
11476 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11477 << g_conf()->mon_max_osd << ")";
11478 goto reply;
11479 }
11480
11481 // Don't allow shrinking OSD number as this will cause data loss
11482 // and may cause kernel crashes.
11483 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11484 if (newmax < osdmap.get_max_osd()) {
11485 // Check if the OSDs exist between current max and new value.
11486 // If there are any OSDs exist, then don't allow shrinking number
11487 // of OSDs.
11488 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11489 if (osdmap.exists(i)) {
11490 err = -EBUSY;
11491 ss << "cannot shrink max_osd to " << newmax
11492 << " because osd." << i << " (and possibly others) still in use";
11493 goto reply;
11494 }
11495 }
11496 }
11497
11498 pending_inc.new_max_osd = newmax;
11499 ss << "set new max_osd = " << pending_inc.new_max_osd;
11500 getline(ss, rs);
11501 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11502 get_last_committed() + 1));
11503 return true;
11504
11505 } else if (prefix == "osd set-full-ratio" ||
11506 prefix == "osd set-backfillfull-ratio" ||
11507 prefix == "osd set-nearfull-ratio") {
11508 double n;
11509 if (!cmd_getval(cmdmap, "ratio", n)) {
11510 ss << "unable to parse 'ratio' value '"
11511 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11512 err = -EINVAL;
11513 goto reply;
11514 }
11515 if (prefix == "osd set-full-ratio")
11516 pending_inc.new_full_ratio = n;
11517 else if (prefix == "osd set-backfillfull-ratio")
11518 pending_inc.new_backfillfull_ratio = n;
11519 else if (prefix == "osd set-nearfull-ratio")
11520 pending_inc.new_nearfull_ratio = n;
11521 ss << prefix << " " << n;
11522 getline(ss, rs);
11523 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11524 get_last_committed() + 1));
11525 return true;
11526 } else if (prefix == "osd set-require-min-compat-client") {
11527 string v;
11528 cmd_getval(cmdmap, "version", v);
11529 ceph_release_t vno = ceph_release_from_name(v);
11530 if (!vno) {
11531 ss << "version " << v << " is not recognized";
11532 err = -EINVAL;
11533 goto reply;
11534 }
11535 OSDMap newmap;
11536 newmap.deepish_copy_from(osdmap);
11537 newmap.apply_incremental(pending_inc);
11538 newmap.require_min_compat_client = vno;
11539 auto mvno = newmap.get_min_compat_client();
11540 if (vno < mvno) {
11541 ss << "osdmap current utilizes features that require " << mvno
11542 << "; cannot set require_min_compat_client below that to " << vno;
11543 err = -EPERM;
11544 goto reply;
11545 }
11546 bool sure = false;
11547 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11548 if (!sure) {
11549 FeatureMap m;
11550 mon.get_combined_feature_map(&m);
11551 uint64_t features = ceph_release_features(to_integer<int>(vno));
11552 bool first = true;
11553 bool ok = true;
11554 for (int type : {
11555 CEPH_ENTITY_TYPE_CLIENT,
11556 CEPH_ENTITY_TYPE_MDS,
11557 CEPH_ENTITY_TYPE_MGR }) {
11558 auto p = m.m.find(type);
11559 if (p == m.m.end()) {
11560 continue;
11561 }
11562 for (auto& q : p->second) {
11563 uint64_t missing = ~q.first & features;
11564 if (missing) {
11565 if (first) {
11566 ss << "cannot set require_min_compat_client to " << v << ": ";
11567 } else {
11568 ss << "; ";
11569 }
11570 first = false;
11571 ss << q.second << " connected " << ceph_entity_type_name(type)
11572 << "(s) look like " << ceph_release_name(
11573 ceph_release_from_features(q.first))
11574 << " (missing 0x" << std::hex << missing << std::dec << ")";
11575 ok = false;
11576 }
11577 }
11578 }
11579 if (!ok) {
11580 ss << "; add --yes-i-really-mean-it to do it anyway";
11581 err = -EPERM;
11582 goto reply;
11583 }
11584 }
11585 ss << "set require_min_compat_client to " << vno;
11586 pending_inc.new_require_min_compat_client = vno;
11587 getline(ss, rs);
11588 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11589 get_last_committed() + 1));
11590 return true;
11591 } else if (prefix == "osd pause") {
11592 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11593
11594 } else if (prefix == "osd unpause") {
11595 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11596
11597 } else if (prefix == "osd set") {
11598 bool sure = false;
11599 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11600
11601 string key;
11602 cmd_getval(cmdmap, "key", key);
11603 if (key == "pause")
11604 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11605 else if (key == "noup")
11606 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11607 else if (key == "nodown")
11608 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11609 else if (key == "noout")
11610 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11611 else if (key == "noin")
11612 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11613 else if (key == "nobackfill")
11614 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11615 else if (key == "norebalance")
11616 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11617 else if (key == "norecover")
11618 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11619 else if (key == "noscrub")
11620 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11621 else if (key == "nodeep-scrub")
11622 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11623 else if (key == "notieragent")
11624 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11625 else if (key == "nosnaptrim")
11626 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11627 else if (key == "pglog_hardlimit") {
11628 if (!osdmap.get_num_up_osds() && !sure) {
11629 ss << "Not advisable to continue since no OSDs are up. Pass "
11630 << "--yes-i-really-mean-it if you really wish to continue.";
11631 err = -EPERM;
11632 goto reply;
11633 }
11634 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11635 // we are reusing a jewel feature bit that was retired in luminous.
11636 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11637 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11638 || sure)) {
11639 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11640 } else {
11641 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11642 err = -EPERM;
11643 goto reply;
11644 }
11645 } else {
11646 ss << "unrecognized flag '" << key << "'";
11647 err = -EINVAL;
11648 }
11649
11650 } else if (prefix == "osd unset") {
11651 string key;
11652 cmd_getval(cmdmap, "key", key);
11653 if (key == "pause")
11654 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11655 else if (key == "noup")
11656 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11657 else if (key == "nodown")
11658 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11659 else if (key == "noout")
11660 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11661 else if (key == "noin")
11662 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11663 else if (key == "nobackfill")
11664 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11665 else if (key == "norebalance")
11666 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11667 else if (key == "norecover")
11668 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11669 else if (key == "noscrub")
11670 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11671 else if (key == "nodeep-scrub")
11672 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11673 else if (key == "notieragent")
11674 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11675 else if (key == "nosnaptrim")
11676 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11677 else {
11678 ss << "unrecognized flag '" << key << "'";
11679 err = -EINVAL;
11680 }
11681
11682 } else if (prefix == "osd require-osd-release") {
11683 string release;
11684 cmd_getval(cmdmap, "release", release);
11685 bool sure = false;
11686 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11687 ceph_release_t rel = ceph_release_from_name(release.c_str());
11688 if (!rel) {
11689 ss << "unrecognized release " << release;
11690 err = -EINVAL;
11691 goto reply;
11692 }
11693 if (rel == osdmap.require_osd_release) {
11694 // idempotent
11695 err = 0;
11696 goto reply;
11697 }
11698 if (osdmap.require_osd_release < ceph_release_t::pacific && !sure) {
11699 ss << "Not advisable to continue since current 'require_osd_release' "
11700 << "refers to a very old Ceph release. Pass "
11701 << "--yes-i-really-mean-it if you really wish to continue.";
11702 err = -EPERM;
11703 goto reply;
11704 }
11705 if (!osdmap.get_num_up_osds() && !sure) {
11706 ss << "Not advisable to continue since no OSDs are up. Pass "
11707 << "--yes-i-really-mean-it if you really wish to continue.";
11708 err = -EPERM;
11709 goto reply;
11710 }
11711 if (rel == ceph_release_t::pacific) {
11712 if (!mon.monmap->get_required_features().contains_all(
11713 ceph::features::mon::FEATURE_PACIFIC)) {
11714 ss << "not all mons are pacific";
11715 err = -EPERM;
11716 goto reply;
11717 }
11718 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11719 && !sure) {
11720 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11721 err = -EPERM;
11722 goto reply;
11723 }
11724 } else if (rel == ceph_release_t::quincy) {
11725 if (!mon.monmap->get_required_features().contains_all(
11726 ceph::features::mon::FEATURE_QUINCY)) {
11727 ss << "not all mons are quincy";
11728 err = -EPERM;
11729 goto reply;
11730 }
11731 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11732 && !sure) {
11733 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11734 err = -EPERM;
11735 goto reply;
11736 }
11737 } else if (rel == ceph_release_t::reef) {
11738 if (!mon.monmap->get_required_features().contains_all(
11739 ceph::features::mon::FEATURE_REEF)) {
11740 ss << "not all mons are reef";
11741 err = -EPERM;
11742 goto reply;
11743 }
11744 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_REEF))
11745 && !sure) {
11746 ss << "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
11747 err = -EPERM;
11748 goto reply;
11749 }
11750 } else {
11751 ss << "not supported for this release";
11752 err = -EPERM;
11753 goto reply;
11754 }
11755 if (rel < osdmap.require_osd_release) {
11756 ss << "require_osd_release cannot be lowered once it has been set";
11757 err = -EPERM;
11758 goto reply;
11759 }
11760 pending_inc.new_require_osd_release = rel;
11761 goto update;
11762 } else if (prefix == "osd down" ||
11763 prefix == "osd out" ||
11764 prefix == "osd in" ||
11765 prefix == "osd rm" ||
11766 prefix == "osd stop") {
11767
11768 bool any = false;
11769 bool stop = false;
11770 bool verbose = true;
11771 bool definitely_dead = false;
11772
11773 vector<string> idvec;
11774 cmd_getval(cmdmap, "ids", idvec);
11775 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11776 derr << "definitely_dead " << (int)definitely_dead << dendl;
11777 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11778 set<int> osds;
11779
11780 // wildcard?
11781 if (j == 0 &&
11782 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11783 if (prefix == "osd in") {
11784 // touch out osds only
11785 osdmap.get_out_existing_osds(osds);
11786 } else {
11787 osdmap.get_all_osds(osds);
11788 }
11789 stop = true;
11790 verbose = false; // so the output is less noisy.
11791 } else {
11792 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11793 if (osd < 0) {
11794 ss << "invalid osd id" << osd;
11795 err = -EINVAL;
11796 continue;
11797 } else if (!osdmap.exists(osd)) {
11798 ss << "osd." << osd << " does not exist. ";
11799 continue;
11800 }
11801
11802 osds.insert(osd);
11803 }
11804
11805 for (auto &osd : osds) {
11806 if (prefix == "osd down") {
11807 if (osdmap.is_down(osd)) {
11808 if (verbose)
11809 ss << "osd." << osd << " is already down. ";
11810 } else {
11811 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11812 ss << "marked down osd." << osd << ". ";
11813 any = true;
11814 }
11815 if (definitely_dead) {
11816 if (!pending_inc.new_xinfo.count(osd)) {
11817 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11818 }
11819 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11820 any = true;
11821 }
11822 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11823 }
11824 } else if (prefix == "osd out") {
11825 if (osdmap.is_out(osd)) {
11826 if (verbose)
11827 ss << "osd." << osd << " is already out. ";
11828 } else {
11829 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11830 if (osdmap.osd_weight[osd]) {
11831 if (pending_inc.new_xinfo.count(osd) == 0) {
11832 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11833 }
11834 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11835 }
11836 ss << "marked out osd." << osd << ". ";
11837 std::ostringstream msg;
11838 msg << "Client " << op->get_session()->entity_name
11839 << " marked osd." << osd << " out";
11840 if (osdmap.is_up(osd)) {
11841 msg << ", while it was still marked up";
11842 } else {
11843 auto period = ceph_clock_now() - down_pending_out[osd];
11844 msg << ", after it was down for " << int(period.sec())
11845 << " seconds";
11846 }
11847
11848 mon.clog->info() << msg.str();
11849 any = true;
11850 }
11851 } else if (prefix == "osd in") {
11852 if (osdmap.is_in(osd)) {
11853 if (verbose)
11854 ss << "osd." << osd << " is already in. ";
11855 } else {
11856 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11857 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11858 if (pending_inc.new_xinfo.count(osd) == 0) {
11859 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11860 }
11861 pending_inc.new_xinfo[osd].old_weight = 0;
11862 } else {
11863 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11864 }
11865 ss << "marked in osd." << osd << ". ";
11866 any = true;
11867 }
11868 } else if (prefix == "osd rm") {
11869 err = prepare_command_osd_remove(osd);
11870
11871 if (err == -EBUSY) {
11872 if (any)
11873 ss << ", ";
11874 ss << "osd." << osd << " is still up; must be down before removal. ";
11875 } else {
11876 ceph_assert(err == 0);
11877 if (any) {
11878 ss << ", osd." << osd;
11879 } else {
11880 ss << "removed osd." << osd;
11881 }
11882 any = true;
11883 }
11884 } else if (prefix == "osd stop") {
11885 if (osdmap.is_stop(osd)) {
11886 if (verbose)
11887 ss << "osd." << osd << " is already stopped. ";
11888 } else if (osdmap.is_down(osd)) {
11889 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11890 ss << "stop down osd." << osd << ". ";
11891 any = true;
11892 } else {
11893 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11894 ss << "stop osd." << osd << ". ";
11895 any = true;
11896 }
11897 }
11898 }
11899 }
11900 if (any) {
11901 getline(ss, rs);
11902 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11903 get_last_committed() + 1));
11904 return true;
11905 }
11906 } else if (prefix == "osd set-group" ||
11907 prefix == "osd unset-group" ||
11908 prefix == "osd add-noup" ||
11909 prefix == "osd add-nodown" ||
11910 prefix == "osd add-noin" ||
11911 prefix == "osd add-noout" ||
11912 prefix == "osd rm-noup" ||
11913 prefix == "osd rm-nodown" ||
11914 prefix == "osd rm-noin" ||
11915 prefix == "osd rm-noout") {
11916 bool do_set = prefix == "osd set-group" ||
11917 prefix.find("add") != string::npos;
11918 string flag_str;
11919 unsigned flags = 0;
11920 vector<string> who;
11921 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11922 cmd_getval(cmdmap, "flags", flag_str);
11923 cmd_getval(cmdmap, "who", who);
11924 vector<string> raw_flags;
11925 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11926 for (auto& f : raw_flags) {
11927 if (f == "noup")
11928 flags |= CEPH_OSD_NOUP;
11929 else if (f == "nodown")
11930 flags |= CEPH_OSD_NODOWN;
11931 else if (f == "noin")
11932 flags |= CEPH_OSD_NOIN;
11933 else if (f == "noout")
11934 flags |= CEPH_OSD_NOOUT;
11935 else {
11936 ss << "unrecognized flag '" << f << "', must be one of "
11937 << "{noup,nodown,noin,noout}";
11938 err = -EINVAL;
11939 goto reply;
11940 }
11941 }
11942 } else {
11943 cmd_getval(cmdmap, "ids", who);
11944 if (prefix.find("noup") != string::npos)
11945 flags = CEPH_OSD_NOUP;
11946 else if (prefix.find("nodown") != string::npos)
11947 flags = CEPH_OSD_NODOWN;
11948 else if (prefix.find("noin") != string::npos)
11949 flags = CEPH_OSD_NOIN;
11950 else if (prefix.find("noout") != string::npos)
11951 flags = CEPH_OSD_NOOUT;
11952 else
11953 ceph_assert(0 == "Unreachable!");
11954 }
11955 if (flags == 0) {
11956 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11957 err = -EINVAL;
11958 goto reply;
11959 }
11960 if (who.empty()) {
11961 ss << "must specify at least one or more targets to set/unset";
11962 err = -EINVAL;
11963 goto reply;
11964 }
11965 set<int> osds;
11966 set<int> crush_nodes;
11967 set<int> device_classes;
11968 for (auto& w : who) {
11969 if (w == "any" || w == "all" || w == "*") {
11970 osdmap.get_all_osds(osds);
11971 break;
11972 }
11973 std::stringstream ts;
11974 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11975 osds.insert(osd);
11976 } else if (osdmap.crush->name_exists(w)) {
11977 crush_nodes.insert(osdmap.crush->get_item_id(w));
11978 } else if (osdmap.crush->class_exists(w)) {
11979 device_classes.insert(osdmap.crush->get_class_id(w));
11980 } else {
11981 ss << "unable to parse osd id or crush node or device class: "
11982 << "\"" << w << "\". ";
11983 }
11984 }
11985 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11986 // ss has reason for failure
11987 err = -EINVAL;
11988 goto reply;
11989 }
11990 bool any = false;
11991 for (auto osd : osds) {
11992 if (!osdmap.exists(osd)) {
11993 ss << "osd." << osd << " does not exist. ";
11994 continue;
11995 }
11996 if (do_set) {
11997 if (flags & CEPH_OSD_NOUP) {
11998 any |= osdmap.is_noup_by_osd(osd) ?
11999 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
12000 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
12001 }
12002 if (flags & CEPH_OSD_NODOWN) {
12003 any |= osdmap.is_nodown_by_osd(osd) ?
12004 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
12005 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
12006 }
12007 if (flags & CEPH_OSD_NOIN) {
12008 any |= osdmap.is_noin_by_osd(osd) ?
12009 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
12010 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
12011 }
12012 if (flags & CEPH_OSD_NOOUT) {
12013 any |= osdmap.is_noout_by_osd(osd) ?
12014 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
12015 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
12016 }
12017 } else {
12018 if (flags & CEPH_OSD_NOUP) {
12019 any |= osdmap.is_noup_by_osd(osd) ?
12020 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
12021 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
12022 }
12023 if (flags & CEPH_OSD_NODOWN) {
12024 any |= osdmap.is_nodown_by_osd(osd) ?
12025 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
12026 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
12027 }
12028 if (flags & CEPH_OSD_NOIN) {
12029 any |= osdmap.is_noin_by_osd(osd) ?
12030 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
12031 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
12032 }
12033 if (flags & CEPH_OSD_NOOUT) {
12034 any |= osdmap.is_noout_by_osd(osd) ?
12035 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
12036 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
12037 }
12038 }
12039 }
12040 for (auto& id : crush_nodes) {
12041 auto old_flags = osdmap.get_crush_node_flags(id);
12042 auto& pending_flags = pending_inc.new_crush_node_flags[id];
12043 pending_flags |= old_flags; // adopt existing flags first!
12044 if (do_set) {
12045 pending_flags |= flags;
12046 } else {
12047 pending_flags &= ~flags;
12048 }
12049 any = true;
12050 }
12051 for (auto& id : device_classes) {
12052 auto old_flags = osdmap.get_device_class_flags(id);
12053 auto& pending_flags = pending_inc.new_device_class_flags[id];
12054 pending_flags |= old_flags;
12055 if (do_set) {
12056 pending_flags |= flags;
12057 } else {
12058 pending_flags &= ~flags;
12059 }
12060 any = true;
12061 }
12062 if (any) {
12063 getline(ss, rs);
12064 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
12065 get_last_committed() + 1));
12066 return true;
12067 }
12068 } else if (prefix == "osd pg-temp") {
12069 pg_t pgid;
12070 err = parse_pgid(cmdmap, ss, pgid);
12071 if (err < 0)
12072 goto reply;
12073 if (pending_inc.new_pg_temp.count(pgid)) {
12074 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
12075 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12076 return true;
12077 }
12078
12079 vector<int64_t> id_vec;
12080 vector<int32_t> new_pg_temp;
12081 cmd_getval(cmdmap, "id", id_vec);
12082 if (id_vec.empty()) {
12083 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
12084 ss << "done cleaning up pg_temp of " << pgid;
12085 goto update;
12086 }
12087 for (auto osd : id_vec) {
12088 if (!osdmap.exists(osd)) {
12089 ss << "osd." << osd << " does not exist";
12090 err = -ENOENT;
12091 goto reply;
12092 }
12093 new_pg_temp.push_back(osd);
12094 }
12095
12096 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12097 if ((int)new_pg_temp.size() < pool_min_size) {
12098 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
12099 << pool_min_size << ")";
12100 err = -EINVAL;
12101 goto reply;
12102 }
12103
12104 int pool_size = osdmap.get_pg_pool_size(pgid);
12105 if ((int)new_pg_temp.size() > pool_size) {
12106 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
12107 << pool_size << ")";
12108 err = -EINVAL;
12109 goto reply;
12110 }
12111
12112 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
12113 new_pg_temp.begin(), new_pg_temp.end());
12114 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
12115 goto update;
12116 } else if (prefix == "osd primary-temp" ||
12117 prefix == "osd rm-primary-temp") {
12118 pg_t pgid;
12119 err = parse_pgid(cmdmap, ss, pgid);
12120 if (err < 0)
12121 goto reply;
12122
12123 int64_t osd;
12124 if (prefix == "osd primary-temp") {
12125 if (!cmd_getval(cmdmap, "id", osd)) {
12126 ss << "unable to parse 'id' value '"
12127 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12128 err = -EINVAL;
12129 goto reply;
12130 }
12131 if (!osdmap.exists(osd)) {
12132 ss << "osd." << osd << " does not exist";
12133 err = -ENOENT;
12134 goto reply;
12135 }
12136 }
12137 else if (prefix == "osd rm-primary-temp") {
12138 osd = -1;
12139 }
12140 else {
12141 ceph_assert(0 == "Unreachable!");
12142 }
12143
12144 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12145 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12146 ss << "require_min_compat_client "
12147 << osdmap.require_min_compat_client
12148 << " < firefly, which is required for primary-temp";
12149 err = -EPERM;
12150 goto reply;
12151 }
12152
12153 pending_inc.new_primary_temp[pgid] = osd;
12154 ss << "set " << pgid << " primary_temp mapping to " << osd;
12155 goto update;
12156 } else if (prefix == "pg repeer") {
12157 pg_t pgid;
12158 err = parse_pgid(cmdmap, ss, pgid);
12159 if (err < 0)
12160 goto reply;
12161 vector<int> acting;
12162 int primary;
12163 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12164 if (primary < 0) {
12165 err = -EAGAIN;
12166 ss << "pg currently has no primary";
12167 goto reply;
12168 }
12169 if (acting.size() > 1) {
12170 // map to just primary; it will map back to what it wants
12171 pending_inc.new_pg_temp[pgid] = { primary };
12172 } else {
12173 // hmm, pick another arbitrary osd to induce a change. Note
12174 // that this won't work if there is only one suitable OSD in the cluster.
12175 int i;
12176 bool done = false;
12177 for (i = 0; i < osdmap.get_max_osd(); ++i) {
12178 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
12179 continue;
12180 }
12181 pending_inc.new_pg_temp[pgid] = { primary, i };
12182 done = true;
12183 break;
12184 }
12185 if (!done) {
12186 err = -EAGAIN;
12187 ss << "not enough up OSDs in the cluster to force repeer";
12188 goto reply;
12189 }
12190 }
12191 goto update;
12192 } else if (prefix == "osd pg-upmap" ||
12193 prefix == "osd rm-pg-upmap" ||
12194 prefix == "osd pg-upmap-items" ||
12195 prefix == "osd rm-pg-upmap-items" ||
12196 prefix == "osd pg-upmap-primary" ||
12197 prefix == "osd rm-pg-upmap-primary") {
12198 enum {
12199 OP_PG_UPMAP,
12200 OP_RM_PG_UPMAP,
12201 OP_PG_UPMAP_ITEMS,
12202 OP_RM_PG_UPMAP_ITEMS,
12203 OP_PG_UPMAP_PRIMARY,
12204 OP_RM_PG_UPMAP_PRIMARY,
12205 } upmap_option;
12206
12207 if (prefix == "osd pg-upmap") {
12208 upmap_option = OP_PG_UPMAP;
12209 } else if (prefix == "osd rm-pg-upmap") {
12210 upmap_option = OP_RM_PG_UPMAP;
12211 } else if (prefix == "osd pg-upmap-items") {
12212 upmap_option = OP_PG_UPMAP_ITEMS;
12213 } else if (prefix == "osd rm-pg-upmap-items") {
12214 upmap_option = OP_RM_PG_UPMAP_ITEMS;
12215 } else if (prefix == "osd pg-upmap-primary") {
12216 upmap_option = OP_PG_UPMAP_PRIMARY;
12217 } else if (prefix == "osd rm-pg-upmap-primary") {
12218 upmap_option = OP_RM_PG_UPMAP_PRIMARY;
12219 } else {
12220 ceph_abort_msg("invalid upmap option");
12221 }
12222
12223 ceph_release_t min_release = ceph_release_t::unknown;
12224 string feature_name = "unknown";
12225 switch (upmap_option) {
12226 case OP_PG_UPMAP: // fall through
12227 case OP_RM_PG_UPMAP: // fall through
12228 case OP_PG_UPMAP_ITEMS: // fall through
12229 case OP_RM_PG_UPMAP_ITEMS:
12230 min_release = ceph_release_t::luminous;
12231 feature_name = "pg-upmap";
12232 break;
12233
12234 case OP_PG_UPMAP_PRIMARY: // fall through
12235 case OP_RM_PG_UPMAP_PRIMARY:
12236 min_release = ceph_release_t::reef;
12237 feature_name = "pg-upmap-primary";
12238 break;
12239
12240 default:
12241 ceph_abort_msg("invalid upmap option");
12242 }
12243 uint64_t min_feature = CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
12244 string min_release_name = ceph_release_name(static_cast<int>(min_release));
12245
12246 if (osdmap.require_min_compat_client < min_release) {
12247 ss << "min_compat_client "
12248 << osdmap.require_min_compat_client
12249 << " < " << min_release_name << ", which is required for " << feature_name << ". "
12250 << "Try 'ceph osd set-require-min-compat-client " << min_release_name << "' "
12251 << "before using the new interface";
12252 err = -EPERM;
12253 goto reply;
12254 }
12255
12256 //TODO: Should I add feature and test for upmap-primary?
12257 err = check_cluster_features(min_feature, ss);
12258 if (err == -EAGAIN)
12259 goto wait;
12260 if (err < 0)
12261 goto reply;
12262 pg_t pgid;
12263 err = parse_pgid(cmdmap, ss, pgid);
12264 if (err < 0)
12265 goto reply;
12266 if (pending_inc.old_pools.count(pgid.pool())) {
12267 ss << "pool of " << pgid << " is pending removal";
12268 err = -ENOENT;
12269 getline(ss, rs);
12270 wait_for_finished_proposal(op,
12271 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12272 return true;
12273 }
12274
12275 // check pending upmap changes
12276 switch (upmap_option) {
12277 case OP_PG_UPMAP: // fall through
12278 case OP_RM_PG_UPMAP:
12279 if (pending_inc.new_pg_upmap.count(pgid) ||
12280 pending_inc.old_pg_upmap.count(pgid)) {
12281 dout(10) << __func__ << " waiting for pending update on "
12282 << pgid << dendl;
12283 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12284 return true;
12285 }
12286 break;
12287
12288 case OP_PG_UPMAP_PRIMARY: // fall through
12289 case OP_RM_PG_UPMAP_PRIMARY:
12290 {
12291 const pg_pool_t *pt = osdmap.get_pg_pool(pgid.pool());
12292 if (! pt->is_replicated()) {
12293 ss << "pg-upmap-primary is only supported for replicated pools";
12294 err = -EINVAL;
12295 goto reply;
12296 }
12297 }
12298 // fall through
12299 case OP_PG_UPMAP_ITEMS: // fall through
12300 case OP_RM_PG_UPMAP_ITEMS: // fall through
12301 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12302 pending_inc.old_pg_upmap_items.count(pgid)) {
12303 dout(10) << __func__ << " waiting for pending update on "
12304 << pgid << dendl;
12305 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12306 return true;
12307 }
12308 break;
12309
12310 default:
12311 ceph_abort_msg("invalid upmap option");
12312 }
12313
12314 switch (upmap_option) {
12315 case OP_PG_UPMAP:
12316 {
12317 vector<int64_t> id_vec;
12318 if (!cmd_getval(cmdmap, "id", id_vec)) {
12319 ss << "unable to parse 'id' value(s) '"
12320 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12321 err = -EINVAL;
12322 goto reply;
12323 }
12324
12325 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12326 if ((int)id_vec.size() < pool_min_size) {
12327 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12328 << pool_min_size << ")";
12329 err = -EINVAL;
12330 goto reply;
12331 }
12332
12333 int pool_size = osdmap.get_pg_pool_size(pgid);
12334 if ((int)id_vec.size() > pool_size) {
12335 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12336 << pool_size << ")";
12337 err = -EINVAL;
12338 goto reply;
12339 }
12340
12341 vector<int32_t> new_pg_upmap;
12342 for (auto osd : id_vec) {
12343 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12344 ss << "osd." << osd << " does not exist";
12345 err = -ENOENT;
12346 goto reply;
12347 }
12348 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12349 if (it != new_pg_upmap.end()) {
12350 ss << "osd." << osd << " already exists, ";
12351 continue;
12352 }
12353 new_pg_upmap.push_back(osd);
12354 }
12355
12356 if (new_pg_upmap.empty()) {
12357 ss << "no valid upmap items(pairs) is specified";
12358 err = -EINVAL;
12359 goto reply;
12360 }
12361
12362 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12363 new_pg_upmap.begin(), new_pg_upmap.end());
12364 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
12365 }
12366 break;
12367
12368 case OP_RM_PG_UPMAP:
12369 {
12370 pending_inc.old_pg_upmap.insert(pgid);
12371 ss << "clear " << pgid << " pg_upmap mapping";
12372 }
12373 break;
12374
12375 case OP_PG_UPMAP_ITEMS:
12376 {
12377 vector<int64_t> id_vec;
12378 if (!cmd_getval(cmdmap, "id", id_vec)) {
12379 ss << "unable to parse 'id' value(s) '"
12380 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12381 err = -EINVAL;
12382 goto reply;
12383 }
12384
12385 if (id_vec.size() % 2) {
12386 ss << "you must specify pairs of osd ids to be remapped";
12387 err = -EINVAL;
12388 goto reply;
12389 }
12390
12391 int pool_size = osdmap.get_pg_pool_size(pgid);
12392 if ((int)(id_vec.size() / 2) > pool_size) {
12393 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12394 << pool_size << ")";
12395 err = -EINVAL;
12396 goto reply;
12397 }
12398
12399 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12400 ostringstream items;
12401 items << "[";
12402 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12403 int from = *p++;
12404 int to = *p;
12405 if (from == to) {
12406 ss << "from osd." << from << " == to osd." << to << ", ";
12407 continue;
12408 }
12409 if (!osdmap.exists(from)) {
12410 ss << "osd." << from << " does not exist";
12411 err = -ENOENT;
12412 goto reply;
12413 }
12414 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12415 ss << "osd." << to << " does not exist";
12416 err = -ENOENT;
12417 goto reply;
12418 }
12419 pair<int32_t,int32_t> entry = make_pair(from, to);
12420 auto it = std::find(new_pg_upmap_items.begin(),
12421 new_pg_upmap_items.end(), entry);
12422 if (it != new_pg_upmap_items.end()) {
12423 ss << "osd." << from << " -> osd." << to << " already exists, ";
12424 continue;
12425 }
12426 new_pg_upmap_items.push_back(entry);
12427 items << from << "->" << to << ",";
12428 }
12429 string out(items.str());
12430 out.resize(out.size() - 1); // drop last ','
12431 out += "]";
12432
12433 if (new_pg_upmap_items.empty()) {
12434 ss << "no valid upmap items(pairs) is specified";
12435 err = -EINVAL;
12436 goto reply;
12437 }
12438
12439 pending_inc.new_pg_upmap_items[pgid] =
12440 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12441 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12442 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12443 }
12444 break;
12445
12446 case OP_RM_PG_UPMAP_ITEMS:
12447 {
12448 pending_inc.old_pg_upmap_items.insert(pgid);
12449 ss << "clear " << pgid << " pg_upmap_items mapping";
12450 }
12451 break;
12452
12453 case OP_PG_UPMAP_PRIMARY:
12454 {
12455 int64_t id;
12456 if (!cmd_getval(cmdmap, "id", id)) {
12457 ss << "invalid osd id value '"
12458 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12459 err = -EINVAL;
12460 goto reply;
12461 }
12462 if (id != CRUSH_ITEM_NONE && !osdmap.exists(id)) {
12463 ss << "osd." << id << " does not exist";
12464 err = -ENOENT;
12465 goto reply;
12466 }
12467 vector<int> acting;
12468 int primary;
12469 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12470 if (id == primary) {
12471 ss << "osd." << id << " is already primary for pg " << pgid;
12472 err = -EINVAL;
12473 goto reply;
12474 }
12475 int found_idx = 0;
12476 for (int i = 1 ; i < (int)acting.size(); i++) { // skip 0 on purpose
12477 if (acting[i] == id) {
12478 found_idx = i;
12479 break;
12480 }
12481 }
12482 if (found_idx == 0) {
12483 ss << "osd." << id << " is not in acting set for pg " << pgid;
12484 err = -EINVAL;
12485 goto reply;
12486 }
12487 vector<int> new_acting(acting);
12488 new_acting[found_idx] = new_acting[0];
12489 new_acting[0] = id;
12490 int pool_size = osdmap.get_pg_pool_size(pgid);
12491 if (osdmap.crush->verify_upmap(cct, osdmap.get_pg_pool_crush_rule(pgid),
12492 pool_size, new_acting) >= 0) {
12493 ss << "change primary for pg " << pgid << " to osd." << id;
12494 }
12495 else {
12496 ss << "can't change primary for pg " << pgid << " to osd." << id
12497 << " - illegal pg after the change";
12498 err = -EINVAL;
12499 goto reply;
12500 }
12501 pending_inc.new_pg_upmap_primary[pgid] = id;
12502 //TO-REMOVE:
12503 ldout(cct, 20) << "pg " << pgid << ": set pg_upmap_primary to " << id << dendl;
12504 }
12505 break;
12506
12507 case OP_RM_PG_UPMAP_PRIMARY:
12508 {
12509 pending_inc.old_pg_upmap_primary.insert(pgid);
12510 ss << "clear " << pgid << " pg_upmap_primary mapping";
12511 }
12512 break;
12513
12514 default:
12515 ceph_abort_msg("invalid upmap option");
12516 }
12517
12518 goto update;
12519 } else if (prefix == "osd primary-affinity") {
12520 int64_t id;
12521 if (!cmd_getval(cmdmap, "id", id)) {
12522 ss << "invalid osd id value '"
12523 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12524 err = -EINVAL;
12525 goto reply;
12526 }
12527 double w;
12528 if (!cmd_getval(cmdmap, "weight", w)) {
12529 ss << "unable to parse 'weight' value '"
12530 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12531 err = -EINVAL;
12532 goto reply;
12533 }
12534 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12535 if (ww < 0L) {
12536 ss << "weight must be >= 0";
12537 err = -EINVAL;
12538 goto reply;
12539 }
12540 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12541 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12542 ss << "require_min_compat_client "
12543 << osdmap.require_min_compat_client
12544 << " < firefly, which is required for primary-affinity";
12545 err = -EPERM;
12546 goto reply;
12547 }
12548 if (osdmap.exists(id)) {
12549 pending_inc.new_primary_affinity[id] = ww;
12550 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12551 getline(ss, rs);
12552 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12553 get_last_committed() + 1));
12554 return true;
12555 } else {
12556 ss << "osd." << id << " does not exist";
12557 err = -ENOENT;
12558 goto reply;
12559 }
12560 } else if (prefix == "osd reweight") {
12561 int64_t id;
12562 if (!cmd_getval(cmdmap, "id", id)) {
12563 ss << "unable to parse osd id value '"
12564 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12565 err = -EINVAL;
12566 goto reply;
12567 }
12568 double w;
12569 if (!cmd_getval(cmdmap, "weight", w)) {
12570 ss << "unable to parse weight value '"
12571 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12572 err = -EINVAL;
12573 goto reply;
12574 }
12575 long ww = (int)((double)CEPH_OSD_IN*w);
12576 if (ww < 0L) {
12577 ss << "weight must be >= 0";
12578 err = -EINVAL;
12579 goto reply;
12580 }
12581 if (osdmap.exists(id)) {
12582 pending_inc.new_weight[id] = ww;
12583 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12584 getline(ss, rs);
12585 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12586 get_last_committed() + 1));
12587 return true;
12588 } else {
12589 ss << "osd." << id << " does not exist";
12590 err = -ENOENT;
12591 goto reply;
12592 }
12593 } else if (prefix == "osd reweightn") {
12594 map<int32_t, uint32_t> weights;
12595 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12596 if (err) {
12597 ss << "unable to parse 'weights' value '"
12598 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12599 goto reply;
12600 }
12601 pending_inc.new_weight.insert(weights.begin(), weights.end());
12602 wait_for_finished_proposal(
12603 op,
12604 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12605 return true;
12606 } else if (prefix == "osd lost") {
12607 int64_t id;
12608 if (!cmd_getval(cmdmap, "id", id)) {
12609 ss << "unable to parse osd id value '"
12610 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12611 err = -EINVAL;
12612 goto reply;
12613 }
12614 bool sure = false;
12615 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12616 if (!sure) {
12617 ss << "are you SURE? this might mean real, permanent data loss. pass "
12618 "--yes-i-really-mean-it if you really do.";
12619 err = -EPERM;
12620 goto reply;
12621 } else if (!osdmap.exists(id)) {
12622 ss << "osd." << id << " does not exist";
12623 err = -ENOENT;
12624 goto reply;
12625 } else if (!osdmap.is_down(id)) {
12626 ss << "osd." << id << " is not down";
12627 err = -EBUSY;
12628 goto reply;
12629 } else {
12630 epoch_t e = osdmap.get_info(id).down_at;
12631 pending_inc.new_lost[id] = e;
12632 ss << "marked osd lost in epoch " << e;
12633 getline(ss, rs);
12634 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12635 get_last_committed() + 1));
12636 return true;
12637 }
12638
12639 } else if (prefix == "osd destroy-actual" ||
12640 prefix == "osd purge-actual" ||
12641 prefix == "osd purge-new") {
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12646 * unreadable).
12647 *
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12652 */
12653
12654 // make sure authmon is writeable.
12655 if (!mon.authmon()->is_writeable()) {
12656 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl;
12658 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12659 return false;
12660 }
12661
12662 int64_t id;
12663 if (!cmd_getval(cmdmap, "id", id)) {
12664 auto p = cmdmap.find("id");
12665 if (p == cmdmap.end()) {
12666 ss << "no osd id specified";
12667 } else {
12668 ss << "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12670 }
12671 err = -EINVAL;
12672 goto reply;
12673 }
12674
12675 bool is_destroy = (prefix == "osd destroy-actual");
12676 if (!is_destroy) {
12677 ceph_assert("osd purge-actual" == prefix ||
12678 "osd purge-new" == prefix);
12679 }
12680
12681 bool sure = false;
12682 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12683 if (!sure) {
12684 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
12688 err = -EPERM;
12689 goto reply;
12690 } else if (!osdmap.exists(id)) {
12691 ss << "osd." << id << " does not exist";
12692 err = 0; // idempotent
12693 goto reply;
12694 } else if (osdmap.is_up(id)) {
12695 ss << "osd." << id << " is not `down`.";
12696 err = -EBUSY;
12697 goto reply;
12698 } else if (is_destroy && osdmap.is_destroyed(id)) {
12699 ss << "destroyed osd." << id;
12700 err = 0;
12701 goto reply;
12702 }
12703
12704 if (prefix == "osd purge-new" &&
12705 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12706 ss << "osd." << id << " is not new";
12707 err = -EPERM;
12708 goto reply;
12709 }
12710
12711 bool goto_reply = false;
12712
12713 paxos.plug();
12714 if (is_destroy) {
12715 err = prepare_command_osd_destroy(id, ss);
12716 // we checked above that it should exist.
12717 ceph_assert(err != -ENOENT);
12718 } else {
12719 err = prepare_command_osd_purge(id, ss);
12720 if (err == -ENOENT) {
12721 err = 0;
12722 ss << "osd." << id << " does not exist.";
12723 goto_reply = true;
12724 }
12725 }
12726 paxos.unplug();
12727
12728 if (err < 0 || goto_reply) {
12729 goto reply;
12730 }
12731
12732 if (is_destroy) {
12733 ss << "destroyed osd." << id;
12734 } else {
12735 ss << "purged osd." << id;
12736 }
12737
12738 getline(ss, rs);
12739 wait_for_finished_proposal(op,
12740 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12741 force_immediate_propose();
12742 return true;
12743
12744 } else if (prefix == "osd new") {
12745
12746 // make sure authmon is writeable.
12747 if (!mon.authmon()->is_writeable()) {
12748 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12749 << "osd new" << dendl;
12750 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12751 return false;
12752 }
12753
12754 // make sure kvmon is writeable.
12755 if (!mon.kvmon()->is_writeable()) {
12756 dout(10) << __func__ << " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl;
12758 mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12759 return false;
12760 }
12761
12762 map<string,string> param_map;
12763
12764 bufferlist bl = m->get_data();
12765 string param_json = bl.to_str();
12766 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12767
12768 err = get_json_str_map(param_json, ss, &param_map);
12769 if (err < 0)
12770 goto reply;
12771
12772 dout(20) << __func__ << " osd new params " << param_map << dendl;
12773
12774 paxos.plug();
12775 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12776 paxos.unplug();
12777
12778 if (err < 0) {
12779 goto reply;
12780 }
12781
12782 if (f) {
12783 f->flush(rdata);
12784 } else {
12785 rdata.append(ss);
12786 }
12787
12788 if (err == EEXIST) {
12789 // idempotent operation
12790 err = 0;
12791 goto reply;
12792 }
12793
12794 wait_for_finished_proposal(op,
12795 new Monitor::C_Command(mon, op, 0, rs, rdata,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12798 return true;
12799
12800 } else if (prefix == "osd create") {
12801
12802 // optional id provided?
12803 int64_t id = -1, cmd_id = -1;
12804 if (cmd_getval(cmdmap, "id", cmd_id)) {
12805 if (cmd_id < 0) {
12806 ss << "invalid osd id value '" << cmd_id << "'";
12807 err = -EINVAL;
12808 goto reply;
12809 }
12810 dout(10) << " osd create got id " << cmd_id << dendl;
12811 }
12812
12813 uuid_d uuid;
12814 string uuidstr;
12815 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12816 if (!uuid.parse(uuidstr.c_str())) {
12817 ss << "invalid uuid value '" << uuidstr << "'";
12818 err = -EINVAL;
12819 goto reply;
12820 }
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12823 id = cmd_id;
12824 }
12825
12826 int32_t new_id = -1;
12827 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12828 if (err < 0) {
12829 if (err == -EAGAIN) {
12830 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12831 return true;
12832 }
12833 // a check has failed; reply to the user.
12834 goto reply;
12835
12836 } else if (err == EEXIST) {
12837 // this is an idempotent operation; we can go ahead and reply.
12838 if (f) {
12839 f->open_object_section("created_osd");
12840 f->dump_int("osdid", new_id);
12841 f->close_section();
12842 f->flush(rdata);
12843 } else {
12844 ss << new_id;
12845 rdata.append(ss);
12846 }
12847 err = 0;
12848 goto reply;
12849 }
12850
12851 string empty_device_class;
12852 do_osd_create(id, uuid, empty_device_class, &new_id);
12853
12854 if (f) {
12855 f->open_object_section("created_osd");
12856 f->dump_int("osdid", new_id);
12857 f->close_section();
12858 f->flush(rdata);
12859 } else {
12860 ss << new_id;
12861 rdata.append(ss);
12862 }
12863 wait_for_finished_proposal(op,
12864 new Monitor::C_Command(mon, op, 0, rs, rdata,
12865 get_last_committed() + 1));
12866 return true;
12867
12868 } else if (prefix == "osd blocklist clear" ||
12869 prefix == "osd blacklist clear") {
12870 pending_inc.new_blocklist.clear();
12871 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12872 std::list<std::pair<entity_addr_t,utime_t > > range_b;
12873 osdmap.get_blocklist(&blocklist, &range_b);
12874 for (const auto &entry : blocklist) {
12875 pending_inc.old_blocklist.push_back(entry.first);
12876 }
12877 for (const auto &entry : range_b) {
12878 pending_inc.old_range_blocklist.push_back(entry.first);
12879 }
12880 ss << " removed all blocklist entries";
12881 getline(ss, rs);
12882 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12883 get_last_committed() + 1));
12884 return true;
12885 } else if (prefix == "osd blocklist" ||
12886 prefix == "osd blacklist") {
12887 string addrstr, rangestr;
12888 bool range = false;
12889 cmd_getval(cmdmap, "addr", addrstr);
12890 if (cmd_getval(cmdmap, "range", rangestr)) {
12891 if (rangestr == "range") {
12892 range = true;
12893 } else {
12894 ss << "Did you mean to specify \"osd blocklist range\"?";
12895 err = -EINVAL;
12896 goto reply;
12897 }
12898 }
12899 entity_addr_t addr;
12900 if (!addr.parse(addrstr)) {
12901 ss << "unable to parse address " << addrstr;
12902 err = -EINVAL;
12903 goto reply;
12904 }
12905 else {
12906 if (range) {
12907 if (!addr.maybe_cidr()) {
12908 ss << "You specified a range command, but " << addr
12909 << " does not parse as a CIDR range";
12910 err = -EINVAL;
12911 goto reply;
12912 }
12913 addr.type = entity_addr_t::TYPE_CIDR;
12914 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12915 if (err) {
12916 goto reply;
12917 }
12918 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12919 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12920 ss << "Too many bits in range for that protocol!";
12921 err = -EINVAL;
12922 goto reply;
12923 }
12924 } else {
12925 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12926 // always blocklist type ANY
12927 addr.set_type(entity_addr_t::TYPE_ANY);
12928 } else {
12929 addr.set_type(entity_addr_t::TYPE_LEGACY);
12930 }
12931 }
12932
12933 string blocklistop;
12934 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12935 cmd_getval(cmdmap, "blacklistop", blocklistop);
12936 }
12937 if (blocklistop == "add") {
12938 utime_t expires = ceph_clock_now();
12939 // default one hour
12940 double d = cmd_getval_or<double>(cmdmap, "expire",
12941 g_conf()->mon_osd_blocklist_default_expire);
12942 expires += d;
12943
12944 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12945 const auto& addr,
12946 const auto& expires) {
12947 nb[addr] = expires;
12948 // cancel any pending un-blocklisting request too
12949 auto it = std::find(ob.begin(),
12950 ob.end(), addr);
12951 if (it != ob.end()) {
12952 ob.erase(it);
12953 }
12954 };
12955 if (range) {
12956 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12957 pending_inc.old_range_blocklist,
12958 addr, expires);
12959
12960 } else {
12961 add_to_pending_blocklists(pending_inc.new_blocklist,
12962 pending_inc.old_blocklist,
12963 addr, expires);
12964 }
12965
12966 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12967 getline(ss, rs);
12968 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12969 get_last_committed() + 1));
12970 return true;
12971 } else if (blocklistop == "rm") {
12972 auto rm_from_pending_blocklists = [](const auto& addr,
12973 auto& blocklist,
12974 auto& ob, auto& pb) {
12975 if (blocklist.count(addr)) {
12976 ob.push_back(addr);
12977 return true;
12978 } else if (pb.count(addr)) {
12979 pb.erase(addr);
12980 return true;
12981 }
12982 return false;
12983 };
12984 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12985 pending_inc.old_blocklist,
12986 pending_inc.new_blocklist)) ||
12987 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12988 pending_inc.old_range_blocklist,
12989 pending_inc.new_range_blocklist))) {
12990 ss << "un-blocklisting " << addr;
12991 getline(ss, rs);
12992 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12993 get_last_committed() + 1));
12994 return true;
12995 }
12996 ss << addr << " isn't blocklisted";
12997 err = 0;
12998 goto reply;
12999 }
13000 }
13001 } else if (prefix == "osd pool mksnap") {
13002 string poolstr;
13003 cmd_getval(cmdmap, "pool", poolstr);
13004 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13005 if (pool < 0) {
13006 ss << "unrecognized pool '" << poolstr << "'";
13007 err = -ENOENT;
13008 goto reply;
13009 }
13010 string snapname;
13011 cmd_getval(cmdmap, "snap", snapname);
13012 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13013 if (p->is_unmanaged_snaps_mode()) {
13014 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13015 err = -EINVAL;
13016 goto reply;
13017 } else if (p->snap_exists(snapname.c_str())) {
13018 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13019 err = 0;
13020 goto reply;
13021 } else if (p->is_tier()) {
13022 ss << "pool " << poolstr << " is a cache tier";
13023 err = -EINVAL;
13024 goto reply;
13025 }
13026 pg_pool_t *pp = 0;
13027 if (pending_inc.new_pools.count(pool))
13028 pp = &pending_inc.new_pools[pool];
13029 if (!pp) {
13030 pp = &pending_inc.new_pools[pool];
13031 *pp = *p;
13032 }
13033 if (pp->snap_exists(snapname.c_str())) {
13034 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13035 } else {
13036 if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(pool)) {
13037 dout(20) << "pool-level snapshots have been disabled for pools "
13038 "attached to an fs - poolid:" << pool << dendl;
13039 err = -EOPNOTSUPP;
13040 goto reply;
13041 }
13042 pp->add_snap(snapname.c_str(), ceph_clock_now());
13043 pp->set_snap_epoch(pending_inc.epoch);
13044 ss << "created pool " << poolstr << " snap " << snapname;
13045 }
13046 getline(ss, rs);
13047 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13048 get_last_committed() + 1));
13049 return true;
13050 } else if (prefix == "osd pool rmsnap") {
13051 string poolstr;
13052 cmd_getval(cmdmap, "pool", poolstr);
13053 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13054 if (pool < 0) {
13055 ss << "unrecognized pool '" << poolstr << "'";
13056 err = -ENOENT;
13057 goto reply;
13058 }
13059 string snapname;
13060 cmd_getval(cmdmap, "snap", snapname);
13061 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13062 if (p->is_unmanaged_snaps_mode()) {
13063 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13064 err = -EINVAL;
13065 goto reply;
13066 } else if (!p->snap_exists(snapname.c_str())) {
13067 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
13068 err = 0;
13069 goto reply;
13070 }
13071 pg_pool_t *pp = 0;
13072 if (pending_inc.new_pools.count(pool))
13073 pp = &pending_inc.new_pools[pool];
13074 if (!pp) {
13075 pp = &pending_inc.new_pools[pool];
13076 *pp = *p;
13077 }
13078 snapid_t sn = pp->snap_exists(snapname.c_str());
13079 if (sn) {
13080 pp->remove_snap(sn);
13081 pp->set_snap_epoch(pending_inc.epoch);
13082 ss << "removed pool " << poolstr << " snap " << snapname;
13083 } else {
13084 ss << "already removed pool " << poolstr << " snap " << snapname;
13085 }
13086 getline(ss, rs);
13087 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13088 get_last_committed() + 1));
13089 return true;
13090 } else if (prefix == "osd pool create") {
13091 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
13092 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
13093 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
13094 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
13095 string pool_type_str;
13096 cmd_getval(cmdmap, "pool_type", pool_type_str);
13097 if (pool_type_str.empty())
13098 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
13099
13100 string poolstr;
13101 cmd_getval(cmdmap, "pool", poolstr);
13102 bool confirm = false;
13103 //confirmation may be set to true only by internal operations.
13104 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13105 if (poolstr[0] == '.' && !confirm) {
13106 ss << "pool names beginning with . are not allowed";
13107 err = 0;
13108 goto reply;
13109 }
13110 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13111 if (pool_id >= 0) {
13112 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13113 if (pool_type_str != p->get_type_name()) {
13114 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
13115 err = -EINVAL;
13116 } else {
13117 ss << "pool '" << poolstr << "' already exists";
13118 err = 0;
13119 }
13120 goto reply;
13121 }
13122
13123 int pool_type;
13124 if (pool_type_str == "replicated") {
13125 pool_type = pg_pool_t::TYPE_REPLICATED;
13126 } else if (pool_type_str == "erasure") {
13127 pool_type = pg_pool_t::TYPE_ERASURE;
13128 } else {
13129 ss << "unknown pool type '" << pool_type_str << "'";
13130 err = -EINVAL;
13131 goto reply;
13132 }
13133
13134 bool implicit_rule_creation = false;
13135 int64_t expected_num_objects = 0;
13136 string rule_name;
13137 cmd_getval(cmdmap, "rule", rule_name);
13138 string erasure_code_profile;
13139 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
13140
13141 if (pool_type == pg_pool_t::TYPE_ERASURE) {
13142 if (erasure_code_profile == "")
13143 erasure_code_profile = "default";
13144 //handle the erasure code profile
13145 if (erasure_code_profile == "default") {
13146 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
13147 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
13148 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
13149 goto wait;
13150 }
13151
13152 map<string,string> profile_map;
13153 err = osdmap.get_erasure_code_profile_default(cct,
13154 profile_map,
13155 &ss);
13156 if (err)
13157 goto reply;
13158 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
13159 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
13160 goto wait;
13161 }
13162 }
13163 if (rule_name == "") {
13164 implicit_rule_creation = true;
13165 if (erasure_code_profile == "default") {
13166 rule_name = "erasure-code";
13167 } else {
13168 dout(1) << "implicitly use rule named after the pool: "
13169 << poolstr << dendl;
13170 rule_name = poolstr;
13171 }
13172 }
13173 expected_num_objects =
13174 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13175 } else {
13176 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13177 // and put expected_num_objects to rule field
13178 if (erasure_code_profile != "") { // cmd is from CLI
13179 if (rule_name != "") {
13180 string interr;
13181 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13182 if (interr.length()) {
13183 ss << "error parsing integer value '" << rule_name << "': " << interr;
13184 err = -EINVAL;
13185 goto reply;
13186 }
13187 }
13188 rule_name = erasure_code_profile;
13189 } else { // cmd is well-formed
13190 expected_num_objects =
13191 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13192 }
13193 }
13194
13195 if (!implicit_rule_creation && rule_name != "") {
13196 int rule;
13197 err = get_crush_rule(rule_name, &rule, &ss);
13198 if (err == -EAGAIN) {
13199 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13200 return true;
13201 }
13202 if (err)
13203 goto reply;
13204 }
13205
13206 if (expected_num_objects < 0) {
13207 ss << "'expected_num_objects' must be non-negative";
13208 err = -EINVAL;
13209 goto reply;
13210 }
13211
13212 set<int32_t> osds;
13213 osdmap.get_all_osds(osds);
13214 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13215 string type;
13216 if (!get_osd_objectstore_type(osd, &type)) {
13217 return type == "filestore";
13218 } else {
13219 return false;
13220 }
13221 });
13222
13223 if (has_filestore_osd &&
13224 expected_num_objects > 0 &&
13225 cct->_conf->filestore_merge_threshold > 0) {
13226 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13227 err = -EINVAL;
13228 goto reply;
13229 }
13230
13231 if (has_filestore_osd &&
13232 expected_num_objects == 0 &&
13233 cct->_conf->filestore_merge_threshold < 0) {
13234 int osds = osdmap.get_num_osds();
13235 bool sure = false;
13236 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13237 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
13238 ss << "For better initial performance on pools expected to store a "
13239 << "large number of objects, consider supplying the "
13240 << "expected_num_objects parameter when creating the pool."
13241 << " Pass --yes-i-really-mean-it to ignore it";
13242 err = -EPERM;
13243 goto reply;
13244 }
13245 }
13246
13247 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
13248 FastReadType fast_read = FAST_READ_DEFAULT;
13249 if (fast_read_param == 0)
13250 fast_read = FAST_READ_OFF;
13251 else if (fast_read_param > 0)
13252 fast_read = FAST_READ_ON;
13253
13254 int64_t repl_size = 0;
13255 cmd_getval(cmdmap, "size", repl_size);
13256 int64_t target_size_bytes = 0;
13257 double target_size_ratio = 0.0;
13258 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13259 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13260
13261 string pg_autoscale_mode;
13262 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
13263
13264 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
13265
13266 bool crimson = cmd_getval_or<bool>(cmdmap, "crimson", false) ||
13267 cct->_conf.get_val<bool>("osd_pool_default_crimson");
13268
13269 err = prepare_new_pool(poolstr,
13270 -1, // default crush rule
13271 rule_name,
13272 pg_num, pgp_num, pg_num_min, pg_num_max,
13273 repl_size, target_size_bytes, target_size_ratio,
13274 erasure_code_profile, pool_type,
13275 (uint64_t)expected_num_objects,
13276 fast_read,
13277 pg_autoscale_mode,
13278 bulk,
13279 crimson,
13280 &ss);
13281 if (err < 0) {
13282 switch(err) {
13283 case -EEXIST:
13284 ss << "pool '" << poolstr << "' already exists";
13285 break;
13286 case -EAGAIN:
13287 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13288 return true;
13289 case -ERANGE:
13290 goto reply;
13291 default:
13292 goto reply;
13293 break;
13294 }
13295 } else {
13296 ss << "pool '" << poolstr << "' created";
13297 }
13298 getline(ss, rs);
13299 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13300 get_last_committed() + 1));
13301 return true;
13302
13303 } else if (prefix == "osd pool delete" ||
13304 prefix == "osd pool rm") {
13305 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13306 string poolstr, poolstr2, sure;
13307 cmd_getval(cmdmap, "pool", poolstr);
13308 cmd_getval(cmdmap, "pool2", poolstr2);
13309 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13310 if (pool < 0) {
13311 ss << "pool '" << poolstr << "' does not exist";
13312 err = 0;
13313 goto reply;
13314 }
13315
13316 bool force_no_fake = false;
13317 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13318 bool force = false;
13319 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13320 if (poolstr2 != poolstr ||
13321 (!force && !force_no_fake)) {
13322 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13323 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13324 << "followed by --yes-i-really-really-mean-it.";
13325 err = -EPERM;
13326 goto reply;
13327 }
13328 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13329 if (err == -EAGAIN) {
13330 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13331 return true;
13332 }
13333 if (err < 0)
13334 goto reply;
13335 goto update;
13336 } else if (prefix == "osd pool rename") {
13337 string srcpoolstr, destpoolstr;
13338 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13339 cmd_getval(cmdmap, "destpool", destpoolstr);
13340 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13341 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13342 bool confirm = false;
13343 //confirmation may be set to true only by internal operations.
13344 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13345 if (destpoolstr[0] == '.' && !confirm) {
13346 ss << "pool names beginning with . are not allowed";
13347 err = 0;
13348 goto reply;
13349 }
13350 if (pool_src < 0) {
13351 if (pool_dst >= 0) {
13352 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13353 // of operations, assume this rename succeeded, as it is not changing
13354 // the current state. Make sure we output something understandable
13355 // for whoever is issuing the command, if they are paying attention,
13356 // in case it was not intentional; or to avoid a "wtf?" and a bug
13357 // report in case it was intentional, while expecting a failure.
13358 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13359 << destpoolstr << "' does -- assuming successful rename";
13360 err = 0;
13361 } else {
13362 ss << "unrecognized pool '" << srcpoolstr << "'";
13363 err = -ENOENT;
13364 }
13365 goto reply;
13366 } else if (pool_dst >= 0) {
13367 // source pool exists and so does the destination pool
13368 ss << "pool '" << destpoolstr << "' already exists";
13369 err = -EEXIST;
13370 goto reply;
13371 }
13372
13373 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13374 if (ret == 0) {
13375 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13376 } else {
13377 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13378 << cpp_strerror(ret);
13379 }
13380 getline(ss, rs);
13381 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13382 get_last_committed() + 1));
13383 return true;
13384
13385 } else if (prefix == "osd pool set") {
13386 err = prepare_command_pool_set(cmdmap, ss);
13387 if (err == -EAGAIN)
13388 goto wait;
13389 if (err < 0)
13390 goto reply;
13391
13392 getline(ss, rs);
13393 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13394 get_last_committed() + 1));
13395 return true;
13396 } else if (prefix == "osd tier add") {
13397 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13398 if (err == -EAGAIN)
13399 goto wait;
13400 if (err)
13401 goto reply;
13402 string poolstr;
13403 cmd_getval(cmdmap, "pool", poolstr);
13404 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13405 if (pool_id < 0) {
13406 ss << "unrecognized pool '" << poolstr << "'";
13407 err = -ENOENT;
13408 goto reply;
13409 }
13410 string tierpoolstr;
13411 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13412 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13413 if (tierpool_id < 0) {
13414 ss << "unrecognized pool '" << tierpoolstr << "'";
13415 err = -ENOENT;
13416 goto reply;
13417 }
13418 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13419 ceph_assert(p);
13420 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13421 ceph_assert(tp);
13422
13423 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13424 goto reply;
13425 }
13426
13427 // make sure new tier is empty
13428 bool force_nonempty = false;
13429 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13430 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13431 if (pstats && pstats->stats.sum.num_objects != 0 &&
13432 !force_nonempty) {
13433 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13434 err = -ENOTEMPTY;
13435 goto reply;
13436 }
13437 if (tp->is_erasure()) {
13438 ss << "tier pool '" << tierpoolstr
13439 << "' is an ec pool, which cannot be a tier";
13440 err = -ENOTSUP;
13441 goto reply;
13442 }
13443 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13444 (!force_nonempty ||
13445 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13446 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13447 err = -ENOTEMPTY;
13448 goto reply;
13449 }
13450 // go
13451 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13452 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13453 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13454 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13455 return true;
13456 }
13457 np->tiers.insert(tierpool_id);
13458 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13459 ntp->tier_of = pool_id;
13460 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13462 get_last_committed() + 1));
13463 return true;
13464 } else if (prefix == "osd tier remove" ||
13465 prefix == "osd tier rm") {
13466 string poolstr;
13467 cmd_getval(cmdmap, "pool", poolstr);
13468 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13469 if (pool_id < 0) {
13470 ss << "unrecognized pool '" << poolstr << "'";
13471 err = -ENOENT;
13472 goto reply;
13473 }
13474 string tierpoolstr;
13475 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13476 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13477 if (tierpool_id < 0) {
13478 ss << "unrecognized pool '" << tierpoolstr << "'";
13479 err = -ENOENT;
13480 goto reply;
13481 }
13482 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13483 ceph_assert(p);
13484 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13485 ceph_assert(tp);
13486
13487 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13488 goto reply;
13489 }
13490
13491 if (p->tiers.count(tierpool_id) == 0) {
13492 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13493 err = 0;
13494 goto reply;
13495 }
13496 if (tp->tier_of != pool_id) {
13497 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13498 << osdmap.get_pool_name(tp->tier_of) << "': "
13499 // be scary about it; this is an inconsistency and bells must go off
13500 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13501 err = -EINVAL;
13502 goto reply;
13503 }
13504 if (p->read_tier == tierpool_id) {
13505 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13506 err = -EBUSY;
13507 goto reply;
13508 }
13509 // go
13510 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13511 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13512 if (np->tiers.count(tierpool_id) == 0 ||
13513 ntp->tier_of != pool_id ||
13514 np->read_tier == tierpool_id) {
13515 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13516 return true;
13517 }
13518 np->tiers.erase(tierpool_id);
13519 ntp->clear_tier();
13520 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13521 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13522 get_last_committed() + 1));
13523 return true;
13524 } else if (prefix == "osd tier set-overlay") {
13525 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13526 if (err == -EAGAIN)
13527 goto wait;
13528 if (err)
13529 goto reply;
13530 string poolstr;
13531 cmd_getval(cmdmap, "pool", poolstr);
13532 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13533 if (pool_id < 0) {
13534 ss << "unrecognized pool '" << poolstr << "'";
13535 err = -ENOENT;
13536 goto reply;
13537 }
13538 string overlaypoolstr;
13539 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13540 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13541 if (overlaypool_id < 0) {
13542 ss << "unrecognized pool '" << overlaypoolstr << "'";
13543 err = -ENOENT;
13544 goto reply;
13545 }
13546 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13547 ceph_assert(p);
13548 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13549 ceph_assert(overlay_p);
13550 if (p->tiers.count(overlaypool_id) == 0) {
13551 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13552 err = -EINVAL;
13553 goto reply;
13554 }
13555 if (p->read_tier == overlaypool_id) {
13556 err = 0;
13557 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13558 goto reply;
13559 }
13560 if (p->has_read_tier()) {
13561 ss << "pool '" << poolstr << "' has overlay '"
13562 << osdmap.get_pool_name(p->read_tier)
13563 << "'; please remove-overlay first";
13564 err = -EINVAL;
13565 goto reply;
13566 }
13567
13568 // go
13569 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13570 np->read_tier = overlaypool_id;
13571 np->write_tier = overlaypool_id;
13572 np->set_last_force_op_resend(pending_inc.epoch);
13573 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13574 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13575 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13576 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13577 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13578 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13579 get_last_committed() + 1));
13580 return true;
13581 } else if (prefix == "osd tier remove-overlay" ||
13582 prefix == "osd tier rm-overlay") {
13583 string poolstr;
13584 cmd_getval(cmdmap, "pool", poolstr);
13585 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13586 if (pool_id < 0) {
13587 ss << "unrecognized pool '" << poolstr << "'";
13588 err = -ENOENT;
13589 goto reply;
13590 }
13591 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13592 ceph_assert(p);
13593 if (!p->has_read_tier()) {
13594 err = 0;
13595 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13596 goto reply;
13597 }
13598
13599 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13600 goto reply;
13601 }
13602
13603 // go
13604 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13605 if (np->has_read_tier()) {
13606 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13607 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13608 nop->set_last_force_op_resend(pending_inc.epoch);
13609 }
13610 if (np->has_write_tier()) {
13611 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13612 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13613 nop->set_last_force_op_resend(pending_inc.epoch);
13614 }
13615 np->clear_read_tier();
13616 np->clear_write_tier();
13617 np->set_last_force_op_resend(pending_inc.epoch);
13618 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13619 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13620 get_last_committed() + 1));
13621 return true;
13622 } else if (prefix == "osd tier cache-mode") {
13623 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13624 if (err == -EAGAIN)
13625 goto wait;
13626 if (err)
13627 goto reply;
13628 string poolstr;
13629 cmd_getval(cmdmap, "pool", poolstr);
13630 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13631 if (pool_id < 0) {
13632 ss << "unrecognized pool '" << poolstr << "'";
13633 err = -ENOENT;
13634 goto reply;
13635 }
13636 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13637 ceph_assert(p);
13638 if (!p->is_tier()) {
13639 ss << "pool '" << poolstr << "' is not a tier";
13640 err = -EINVAL;
13641 goto reply;
13642 }
13643 string modestr;
13644 cmd_getval(cmdmap, "mode", modestr);
13645 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13646 if (int(mode) < 0) {
13647 ss << "'" << modestr << "' is not a valid cache mode";
13648 err = -EINVAL;
13649 goto reply;
13650 }
13651
13652 bool sure = false;
13653 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13654
13655 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13656 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13657 ss << "'" << modestr << "' is no longer a supported cache mode";
13658 err = -EPERM;
13659 goto reply;
13660 }
13661 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13662 mode != pg_pool_t::CACHEMODE_NONE &&
13663 mode != pg_pool_t::CACHEMODE_PROXY &&
13664 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13665 !sure) {
13666 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13667 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13668 err = -EPERM;
13669 goto reply;
13670 }
13671
13672 // pool already has this cache-mode set and there are no pending changes
13673 if (p->cache_mode == mode &&
13674 (pending_inc.new_pools.count(pool_id) == 0 ||
13675 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13676 ss << "set cache-mode for pool '" << poolstr << "'"
13677 << " to " << pg_pool_t::get_cache_mode_name(mode);
13678 err = 0;
13679 goto reply;
13680 }
13681
13682 /* Mode description:
13683 *
13684 * none: No cache-mode defined
13685 * forward: Forward all reads and writes to base pool [removed]
13686 * writeback: Cache writes, promote reads from base pool
13687 * readonly: Forward writes to base pool
13688 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13689 * proxy: Proxy all reads and writes to base pool
13690 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13691 *
13692 * Hence, these are the allowed transitions:
13693 *
13694 * none -> any
13695 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13696 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13697 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13698 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13699 * writeback -> readproxy || proxy
13700 * readonly -> any
13701 */
13702
13703 // We check if the transition is valid against the current pool mode, as
13704 // it is the only committed state thus far. We will blantly squash
13705 // whatever mode is on the pending state.
13706
13707 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13708 (mode != pg_pool_t::CACHEMODE_PROXY &&
13709 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13710 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13711 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13712 << "' pool; only '"
13713 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13714 << "','"
13715 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13716 << "' allowed.";
13717 err = -EINVAL;
13718 goto reply;
13719 }
13720 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13721 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13722 mode != pg_pool_t::CACHEMODE_PROXY &&
13723 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13724
13725 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13726 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13727 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13728
13729 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13730 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13731 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13732
13733 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13734 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13735 mode != pg_pool_t::CACHEMODE_PROXY &&
13736 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13737
13738 const pool_stat_t* pstats =
13739 mon.mgrstatmon()->get_pool_stat(pool_id);
13740
13741 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13742 ss << "unable to set cache-mode '"
13743 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13744 << "': dirty objects found";
13745 err = -EBUSY;
13746 goto reply;
13747 }
13748 }
13749 // go
13750 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13751 np->cache_mode = mode;
13752 // set this both when moving to and from cache_mode NONE. this is to
13753 // capture legacy pools that were set up before this flag existed.
13754 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13755 ss << "set cache-mode for pool '" << poolstr
13756 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13757 if (mode == pg_pool_t::CACHEMODE_NONE) {
13758 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13759 ceph_assert(base_pool);
13760 if (base_pool->read_tier == pool_id ||
13761 base_pool->write_tier == pool_id)
13762 ss <<" (WARNING: pool is still configured as read or write tier)";
13763 }
13764 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13765 get_last_committed() + 1));
13766 return true;
13767 } else if (prefix == "osd tier add-cache") {
13768 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13769 if (err == -EAGAIN)
13770 goto wait;
13771 if (err)
13772 goto reply;
13773 string poolstr;
13774 cmd_getval(cmdmap, "pool", poolstr);
13775 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13776 if (pool_id < 0) {
13777 ss << "unrecognized pool '" << poolstr << "'";
13778 err = -ENOENT;
13779 goto reply;
13780 }
13781 string tierpoolstr;
13782 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13783 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13784 if (tierpool_id < 0) {
13785 ss << "unrecognized pool '" << tierpoolstr << "'";
13786 err = -ENOENT;
13787 goto reply;
13788 }
13789 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13790 ceph_assert(p);
13791 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13792 ceph_assert(tp);
13793
13794 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13795 goto reply;
13796 }
13797
13798 int64_t size = 0;
13799 if (!cmd_getval(cmdmap, "size", size)) {
13800 ss << "unable to parse 'size' value '"
13801 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13802 err = -EINVAL;
13803 goto reply;
13804 }
13805 // make sure new tier is empty
13806 const pool_stat_t *pstats =
13807 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13808 if (pstats && pstats->stats.sum.num_objects != 0) {
13809 ss << "tier pool '" << tierpoolstr << "' is not empty";
13810 err = -ENOTEMPTY;
13811 goto reply;
13812 }
13813 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13814 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13815 if (int(mode) < 0) {
13816 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13817 err = -EINVAL;
13818 goto reply;
13819 }
13820 HitSet::Params hsp;
13821 auto& cache_hit_set_type =
13822 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13823 if (cache_hit_set_type == "bloom") {
13824 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13825 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13826 hsp = HitSet::Params(bsp);
13827 } else if (cache_hit_set_type == "explicit_hash") {
13828 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13829 } else if (cache_hit_set_type == "explicit_object") {
13830 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13831 } else {
13832 ss << "osd tier cache default hit set type '"
13833 << cache_hit_set_type << "' is not a known type";
13834 err = -EINVAL;
13835 goto reply;
13836 }
13837 // go
13838 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13839 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13840 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13841 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13842 return true;
13843 }
13844 np->tiers.insert(tierpool_id);
13845 np->read_tier = np->write_tier = tierpool_id;
13846 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13847 np->set_last_force_op_resend(pending_inc.epoch);
13848 ntp->set_last_force_op_resend(pending_inc.epoch);
13849 ntp->tier_of = pool_id;
13850 ntp->cache_mode = mode;
13851 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13852 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13853 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13854 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13855 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13856 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13857 ntp->hit_set_params = hsp;
13858 ntp->target_max_bytes = size;
13859 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13860 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13861 get_last_committed() + 1));
13862 return true;
13863 } else if (prefix == "osd pool set-quota") {
13864 string poolstr;
13865 cmd_getval(cmdmap, "pool", poolstr);
13866 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13867 if (pool_id < 0) {
13868 ss << "unrecognized pool '" << poolstr << "'";
13869 err = -ENOENT;
13870 goto reply;
13871 }
13872
13873 string field;
13874 cmd_getval(cmdmap, "field", field);
13875 if (field != "max_objects" && field != "max_bytes") {
13876 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13877 err = -EINVAL;
13878 goto reply;
13879 }
13880
13881 // val could contain unit designations, so we treat as a string
13882 string val;
13883 cmd_getval(cmdmap, "val", val);
13884 string tss;
13885 int64_t value;
13886 if (field == "max_objects") {
13887 value = strict_si_cast<uint64_t>(val, &tss);
13888 } else if (field == "max_bytes") {
13889 value = strict_iecstrtoll(val, &tss);
13890 } else {
13891 ceph_abort_msg("unrecognized option");
13892 }
13893 if (!tss.empty()) {
13894 ss << "error parsing value '" << val << "': " << tss;
13895 err = -EINVAL;
13896 goto reply;
13897 }
13898
13899 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13900 if (field == "max_objects") {
13901 pi->quota_max_objects = value;
13902 } else if (field == "max_bytes") {
13903 pi->quota_max_bytes = value;
13904 } else {
13905 ceph_abort_msg("unrecognized option");
13906 }
13907 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13908 rs = ss.str();
13909 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13910 get_last_committed() + 1));
13911 return true;
13912 } else if (prefix == "osd pool application enable" ||
13913 prefix == "osd pool application disable" ||
13914 prefix == "osd pool application set" ||
13915 prefix == "osd pool application rm") {
13916 err = prepare_command_pool_application(prefix, cmdmap, ss);
13917 if (err == -EAGAIN) {
13918 goto wait;
13919 } else if (err < 0) {
13920 goto reply;
13921 } else {
13922 goto update;
13923 }
13924 } else if (prefix == "osd force-create-pg") {
13925 pg_t pgid;
13926 string pgidstr;
13927 err = parse_pgid(cmdmap, ss, pgid, pgidstr);
13928 if (err < 0)
13929 goto reply;
13930 bool sure = false;
13931 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13932 if (!sure) {
13933 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13934 << "that the cluster will give up ever trying to recover the lost data. Do this "
13935 << "only if you are certain that all copies of the PG are in fact lost and you are "
13936 << "willing to accept that the data is permanently destroyed. Pass "
13937 << "--yes-i-really-mean-it to proceed.";
13938 err = -EPERM;
13939 goto reply;
13940 }
13941 bool creating_now;
13942 {
13943 std::lock_guard<std::mutex> l(creating_pgs_lock);
13944 auto emplaced = creating_pgs.pgs.emplace(
13945 pgid,
13946 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13947 ceph_clock_now()));
13948 creating_now = emplaced.second;
13949 }
13950 if (creating_now) {
13951 ss << "pg " << pgidstr << " now creating, ok";
13952 // set the pool's CREATING flag so that (1) the osd won't ignore our
13953 // create message and (2) we won't propose any future pg_num changes
13954 // until after the PG has been instantiated.
13955 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13956 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13957 }
13958 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13959 err = 0;
13960 goto update;
13961 } else {
13962 ss << "pg " << pgid << " already creating";
13963 err = 0;
13964 goto reply;
13965 }
13966 } else if (prefix == "osd force_healthy_stretch_mode") {
13967 bool sure = false;
13968 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13969 if (!sure) {
13970 ss << "This command will require peering across multiple CRUSH buckets "
13971 "(probably two data centers or availability zones?) and may result in PGs "
13972 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13973 err = -EPERM;
13974 goto reply;
13975 }
13976 try_end_recovery_stretch_mode(true);
13977 ss << "Triggering healthy stretch mode";
13978 err = 0;
13979 goto reply;
13980 } else if (prefix == "osd force_recovery_stretch_mode") {
13981 bool sure = false;
13982 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13983 if (!sure) {
13984 ss << "This command will increase pool sizes to try and spread them "
13985 "across multiple CRUSH buckets (probably two data centers or "
13986 "availability zones?) and should have happened automatically"
13987 "Pass --yes-i-really-mean-it to proceed.";
13988 err = -EPERM;
13989 goto reply;
13990 }
13991 mon.go_recovery_stretch_mode();
13992 ss << "Triggering recovery stretch mode";
13993 err = 0;
13994 goto reply;
13995 } else if (prefix == "osd set-allow-crimson") {
13996
13997 bool sure = false;
13998 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13999
14000 bool experimental_enabled =
14001 g_ceph_context->check_experimental_feature_enabled("crimson");
14002 if (!sure || !experimental_enabled) {
14003 ss << "This command will allow usage of crimson-osd osd daemons. "
14004 << "crimson-osd is not considered stable and will likely cause "
14005 << "crashes or data corruption. At this time, crimson-osd is mainly "
14006 << "useful for performance evaluation, testing, and development. "
14007 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14008 << "the experimental features config. This setting is irrevocable.";
14009 err = -EPERM;
14010 goto reply;
14011 }
14012
14013 err = 0;
14014 if (osdmap.get_allow_crimson()) {
14015 goto reply;
14016 } else {
14017 pending_inc.set_allow_crimson();
14018 goto update;
14019 }
14020 } else {
14021 err = -EINVAL;
14022 }
14023
14024 reply:
14025 getline(ss, rs);
14026 if (err < 0 && rs.length() == 0)
14027 rs = cpp_strerror(err);
14028 mon.reply_command(op, err, rs, rdata, get_last_committed());
14029 return ret;
14030
14031 update:
14032 getline(ss, rs);
14033 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
14034 get_last_committed() + 1));
14035 return true;
14036
14037 wait:
14038 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14039 return true;
14040 }
14041
14042 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
14043 {
14044 op->mark_osdmon_event(__func__);
14045
14046 auto m = op->get_req<MPoolOp>();
14047 MonSession *session = op->get_session();
14048 if (!session) {
14049 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14050 return true;
14051 }
14052
14053 switch (m->op) {
14054 case POOL_OP_CREATE_UNMANAGED_SNAP:
14055 case POOL_OP_DELETE_UNMANAGED_SNAP:
14056 {
14057 const std::string* pool_name = nullptr;
14058 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
14059 if (pg_pool != nullptr) {
14060 pool_name = &osdmap.get_pool_name(m->pool);
14061 }
14062
14063 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
14064 session->entity_name, session->caps,
14065 session->get_peer_socket_addr(),
14066 pool_name)) {
14067 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14068 << "privileges. message: " << *m << std::endl
14069 << "caps: " << session->caps << dendl;
14070 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14071 return true;
14072 }
14073 }
14074 break;
14075 default:
14076 if (!session->is_capable("osd", MON_CAP_W)) {
14077 dout(0) << "got pool op from entity with insufficient privileges. "
14078 << "message: " << *m << std::endl
14079 << "caps: " << session->caps << dendl;
14080 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14081 return true;
14082 }
14083 break;
14084 }
14085
14086 return false;
14087 }
14088
14089 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
14090 {
14091 op->mark_osdmon_event(__func__);
14092 auto m = op->get_req<MPoolOp>();
14093
14094 if (enforce_pool_op_caps(op)) {
14095 return true;
14096 }
14097
14098 if (m->fsid != mon.monmap->fsid) {
14099 dout(0) << __func__ << " drop message on fsid " << m->fsid
14100 << " != " << mon.monmap->fsid << " for " << *m << dendl;
14101 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14102 return true;
14103 }
14104
14105 if (m->op == POOL_OP_CREATE)
14106 return preprocess_pool_op_create(op);
14107
14108 const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
14109 if (p == nullptr) {
14110 dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
14111 if (m->op == POOL_OP_DELETE) {
14112 _pool_op_reply(op, 0, osdmap.get_epoch());
14113 } else {
14114 _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
14115 }
14116 return true;
14117 }
14118
14119 // check if the snap and snapname exist
14120 bool snap_exists = false;
14121 if (p->snap_exists(m->name.c_str()))
14122 snap_exists = true;
14123
14124 switch (m->op) {
14125 case POOL_OP_CREATE_SNAP:
14126 if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
14127 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14128 return true;
14129 }
14130 if (snap_exists) {
14131 _pool_op_reply(op, 0, osdmap.get_epoch());
14132 return true;
14133 }
14134 return false;
14135 case POOL_OP_CREATE_UNMANAGED_SNAP:
14136 if (p->is_pool_snaps_mode()) {
14137 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14138 return true;
14139 }
14140 return false;
14141 case POOL_OP_DELETE_SNAP:
14142 if (p->is_unmanaged_snaps_mode()) {
14143 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14144 return true;
14145 }
14146 if (!snap_exists) {
14147 _pool_op_reply(op, 0, osdmap.get_epoch());
14148 return true;
14149 }
14150 return false;
14151 case POOL_OP_DELETE_UNMANAGED_SNAP:
14152 if (p->is_pool_snaps_mode()) {
14153 _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
14154 return true;
14155 }
14156 if (_is_removed_snap(m->pool, m->snapid)) {
14157 _pool_op_reply(op, 0, osdmap.get_epoch());
14158 return true;
14159 }
14160 return false;
14161 case POOL_OP_DELETE:
14162 if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
14163 _pool_op_reply(op, 0, osdmap.get_epoch());
14164 return true;
14165 }
14166 return false;
14167 case POOL_OP_AUID_CHANGE:
14168 return false;
14169 default:
14170 ceph_abort();
14171 break;
14172 }
14173
14174 return false;
14175 }
14176
14177 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
14178 {
14179 if (!osdmap.have_pg_pool(pool)) {
14180 dout(10) << __func__ << " pool " << pool << " snap " << snap
14181 << " - pool dne" << dendl;
14182 return true;
14183 }
14184 if (osdmap.in_removed_snaps_queue(pool, snap)) {
14185 dout(10) << __func__ << " pool " << pool << " snap " << snap
14186 << " - in osdmap removed_snaps_queue" << dendl;
14187 return true;
14188 }
14189 snapid_t begin, end;
14190 int r = lookup_purged_snap(pool, snap, &begin, &end);
14191 if (r == 0) {
14192 dout(10) << __func__ << " pool " << pool << " snap " << snap
14193 << " - purged, [" << begin << "," << end << ")" << dendl;
14194 return true;
14195 }
14196 return false;
14197 }
14198
14199 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14200 {
14201 if (pending_inc.old_pools.count(pool)) {
14202 dout(10) << __func__ << " pool " << pool << " snap " << snap
14203 << " - pool pending deletion" << dendl;
14204 return true;
14205 }
14206 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14207 dout(10) << __func__ << " pool " << pool << " snap " << snap
14208 << " - in pending new_removed_snaps" << dendl;
14209 return true;
14210 }
14211 return false;
14212 }
14213
14214 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14215 {
14216 op->mark_osdmon_event(__func__);
14217 auto m = op->get_req<MPoolOp>();
14218 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14219 if (pool >= 0) {
14220 _pool_op_reply(op, 0, osdmap.get_epoch());
14221 return true;
14222 }
14223
14224 return false;
14225 }
14226
/**
 * Apply a (non-create, non-delete) pool op to the pending incremental map.
 *
 * Handles pool-managed and unmanaged snapshot create/delete. Replies
 * immediately (returning false) on validation failures or idempotent
 * no-ops caught here; otherwise stages the change in pending_inc and
 * defers the reply until the proposal commits (returning true).
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // pool create/delete have dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // snapshot creation via this path is refused for pools attached to a
  // CephFS filesystem (the MDS owns snapshots there)
  if (m->op == POOL_OP_CREATE_SNAP ||
      m->op == POOL_OP_CREATE_UNMANAGED_SNAP) {
    if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(m->pool)) {
      dout(20) << "monitor-managed snapshots have been disabled for pools "
	  " attached to an fs - pool:" << m->pool << dendl;
      _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
      return false;
    }
  }

  // First pass: validate against the *committed* pool state; reply early
  // on errors and idempotent cases.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is a no-op
      // success (ret stays 0); otherwise break to stage the change below
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from any already-pending update so
  // multiple ops in one proposal window compose correctly
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check
  // against the *projected* state, which may differ from the committed one
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: actually mutate the projected pool info.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      // snap_exists() returns the snapid (0 when absent)
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus clusters need the legacy removed_snaps encoding
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is the reply payload for the client
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's snap_seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with ret and any payload) once the pending map commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14391
14392 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14393 {
14394 op->mark_osdmon_event(__func__);
14395 int err = prepare_new_pool(op);
14396 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14397 return true;
14398 }
14399
14400 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14401 ostream *ss)
14402 {
14403 const string& poolstr = osdmap.get_pool_name(pool_id);
14404
14405 // If the Pool is in use by CephFS, refuse to delete it
14406 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14407 if (pending_fsmap.pool_in_use(pool_id)) {
14408 *ss << "pool '" << poolstr << "' is in use by CephFS";
14409 return -EBUSY;
14410 }
14411
14412 if (pool.tier_of >= 0) {
14413 *ss << "pool '" << poolstr << "' is a tier of '"
14414 << osdmap.get_pool_name(pool.tier_of) << "'";
14415 return -EBUSY;
14416 }
14417 if (!pool.tiers.empty()) {
14418 *ss << "pool '" << poolstr << "' has tiers";
14419 for(auto tier : pool.tiers) {
14420 *ss << " " << osdmap.get_pool_name(tier);
14421 }
14422 return -EBUSY;
14423 }
14424
14425 if (!g_conf()->mon_allow_pool_delete) {
14426 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14427 return -EPERM;
14428 }
14429
14430 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14431 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14432 return -EPERM;
14433 }
14434
14435 *ss << "pool '" << poolstr << "' removed";
14436 return 0;
14437 }
14438
14439 /**
14440 * Check if it is safe to add a tier to a base pool
14441 *
14442 * @return
14443 * True if the operation should proceed, false if we should abort here
14444 * (abort doesn't necessarily mean error, could be idempotency)
14445 */
14446 bool OSDMonitor::_check_become_tier(
14447 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14448 const int64_t base_pool_id, const pg_pool_t *base_pool,
14449 int *err,
14450 ostream *ss) const
14451 {
14452 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14453 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14454
14455 if (tier_pool->is_crimson()) {
14456 *ss << "pool '" << tier_pool_name << "' is a crimson pool, tiering "
14457 << "features are not supported";
14458 *err = -EINVAL;
14459 return false;
14460 }
14461 if (base_pool->is_crimson()) {
14462 *ss << "pool '" << base_pool_name << "' is a crimson pool, tiering "
14463 << "features are not supported";
14464 *err = -EINVAL;
14465 return false;
14466 }
14467
14468 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14469 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14470 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14471 *err = -EBUSY;
14472 return false;
14473 }
14474
14475 if (base_pool->tiers.count(tier_pool_id)) {
14476 ceph_assert(tier_pool->tier_of == base_pool_id);
14477 *err = 0;
14478 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14479 << base_pool_name << "'";
14480 return false;
14481 }
14482
14483 if (base_pool->is_tier()) {
14484 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14485 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14486 << "multiple tiers are not yet supported.";
14487 *err = -EINVAL;
14488 return false;
14489 }
14490
14491 if (tier_pool->has_tiers()) {
14492 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14493 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14494 it != tier_pool->tiers.end(); ++it)
14495 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14496 *ss << " multiple tiers are not yet supported.";
14497 *err = -EINVAL;
14498 return false;
14499 }
14500
14501 if (tier_pool->is_tier()) {
14502 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14503 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14504 *err = -EINVAL;
14505 return false;
14506 }
14507
14508 *err = 0;
14509 return true;
14510 }
14511
14512
14513 /**
14514 * Check if it is safe to remove a tier from this base pool
14515 *
14516 * @return
14517 * True if the operation should proceed, false if we should abort here
14518 * (abort doesn't necessarily mean error, could be idempotency)
14519 */
14520 bool OSDMonitor::_check_remove_tier(
14521 const int64_t base_pool_id, const pg_pool_t *base_pool,
14522 const pg_pool_t *tier_pool,
14523 int *err, ostream *ss) const
14524 {
14525 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14526
14527 // Apply CephFS-specific checks
14528 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14529 if (pending_fsmap.pool_in_use(base_pool_id)) {
14530 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14531 // If the underlying pool is erasure coded and does not allow EC
14532 // overwrites, we can't permit the removal of the replicated tier that
14533 // CephFS relies on to access it
14534 *ss << "pool '" << base_pool_name <<
14535 "' does not allow EC overwrites and is in use by CephFS"
14536 " via its tier";
14537 *err = -EBUSY;
14538 return false;
14539 }
14540
14541 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14542 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14543 "tier is still in use as a writeback cache. Change the cache "
14544 "mode and flush the cache before removing it";
14545 *err = -EBUSY;
14546 return false;
14547 }
14548 }
14549
14550 *err = 0;
14551 return true;
14552 }
14553
// Queue deletion of a pool in pending_inc, together with cleanup of all
// osdmap state that references it (pg_temp, primary_temp, pg_upmap,
// pg_upmap_items and crush choose_args).
// Returns 0 on success or no-op, -EAGAIN if pending state needs to be
// proposed first, or a negative error from _check_remove_pool().
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  if (pending_inc.old_pools.count(pool)) {
    // idempotent: deletion already queued in this epoch
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    // "fake" deletion: keep the pool's data but rename it out of the way
    // so an administrator can still recover it later.
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      // an empty pg_temp entry in the incremental erases the mapping
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      // -1 in the incremental clears the primary_temp mapping
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush = _get_pending_crush();
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14668
14669 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14670 {
14671 dout(10) << "_prepare_rename_pool " << pool << dendl;
14672 if (pending_inc.old_pools.count(pool)) {
14673 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14674 return -ENOENT;
14675 }
14676 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14677 p != pending_inc.new_pool_names.end();
14678 ++p) {
14679 if (p->second == newname && p->first != pool) {
14680 return -EEXIST;
14681 }
14682 }
14683
14684 pending_inc.new_pool_names[pool] = newname;
14685 return 0;
14686 }
14687
14688 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14689 {
14690 op->mark_osdmon_event(__func__);
14691 auto m = op->get_req<MPoolOp>();
14692 ostringstream ss;
14693 int ret = _prepare_remove_pool(m->pool, &ss, false);
14694 if (ret == -EAGAIN) {
14695 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14696 return true;
14697 }
14698 if (ret < 0)
14699 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14700 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14701 pending_inc.epoch));
14702 return true;
14703 }
14704
14705 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14706 int ret, epoch_t epoch, bufferlist *blp)
14707 {
14708 op->mark_osdmon_event(__func__);
14709 auto m = op->get_req<MPoolOp>();
14710 dout(20) << "_pool_op_reply " << ret << dendl;
14711 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14712 ret, epoch, get_last_committed(), blp);
14713 mon.send_reply(op, reply);
14714 }
14715
// Rescale per-pool "recovery_priority" values that fall outside the
// supported [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] range so they
// fit, preserving relative ordering; updated pools go into pending_inc.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // first pass: find the extremes across all pools that set the option
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  // second pass: scale each out-of-range priority proportionally
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;  // working copy; committed via pending_inc

    int64_t prio = 0;
    pool.opts.get(key, &prio);  // pools without the option keep prio == 0
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      // prio == 0 or already within the representable side of the range
      continue;
    }
    if (n == 0) {
      // priority scaled down to nothing; drop the option entirely
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}
14767
14768 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14769 int *errcode,
14770 set<pg_pool_t*>* pools,
14771 const string& new_crush_rule)
14772 {
14773 dout(20) << __func__ << dendl;
14774 *okay = false;
14775 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14776 if (new_crush_rule_result < 0) {
14777 ss << "unrecognized crush rule " << new_crush_rule_result;
14778 *errcode = new_crush_rule_result;
14779 return;
14780 }
14781 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14782 for (const auto& pooli : osdmap.pools) {
14783 int64_t poolid = pooli.first;
14784 const pg_pool_t *p = &pooli.second;
14785 if (!p->is_replicated()) {
14786 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14787 *errcode = -EINVAL;
14788 return;
14789 }
14790 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14791 if ((p->get_size() != default_size ||
14792 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14793 (p->get_crush_rule() != new_rule)) {
14794 ss << "we currently require stretch mode pools start out with the"
14795 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14796 *errcode = -EINVAL;
14797 return;
14798 }
14799 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14800 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14801 // the attempt may fail and then we have these pool updates...but they won't do anything
14802 // if there is a failure, so if it's hard to change the interface, no need to bother
14803 pools->insert(pp);
14804 }
14805 *okay = true;
14806 return;
14807 }
14808
14809 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14810 int *errcode, bool commit,
14811 const string& dividing_bucket,
14812 uint32_t bucket_count,
14813 const set<pg_pool_t*>& pools,
14814 const string& new_crush_rule)
14815 {
14816 dout(20) << __func__ << dendl;
14817 *okay = false;
14818 CrushWrapper crush = _get_pending_crush();
14819 int dividing_id = -1;
14820 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14821 !type_id.has_value()) {
14822 ss << dividing_bucket << " is not a valid crush bucket type";
14823 *errcode = -ENOENT;
14824 ceph_assert(!commit);
14825 return;
14826 } else {
14827 dividing_id = *type_id;
14828 }
14829 vector<int> subtrees;
14830 crush.get_subtree_of_type(dividing_id, &subtrees);
14831 if (subtrees.size() != 2) {
14832 ss << "there are " << subtrees.size() << dividing_bucket
14833 << "'s in the cluster but stretch mode currently only works with 2!";
14834 *errcode = -EINVAL;
14835 ceph_assert(!commit || subtrees.size() == 2);
14836 return;
14837 }
14838
14839 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14840 if (new_crush_rule_result < 0) {
14841 ss << "unrecognized crush rule " << new_crush_rule;
14842 *errcode = new_crush_rule_result;
14843 ceph_assert(!commit || (new_crush_rule_result > 0));
14844 return;
14845 }
14846 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14847
14848 int weight1 = crush.get_item_weight(subtrees[0]);
14849 int weight2 = crush.get_item_weight(subtrees[1]);
14850 if (weight1 != weight2) {
14851 // TODO: I'm really not sure this is a good idea?
14852 ss << "the 2 " << dividing_bucket
14853 << "instances in the cluster have differing weights "
14854 << weight1 << " and " << weight2
14855 <<" but stretch mode currently requires they be the same!";
14856 *errcode = -EINVAL;
14857 ceph_assert(!commit || (weight1 == weight2));
14858 return;
14859 }
14860 if (bucket_count != 2) {
14861 ss << "currently we only support 2-site stretch clusters!";
14862 *errcode = -EINVAL;
14863 ceph_assert(!commit || bucket_count == 2);
14864 return;
14865 }
14866 // TODO: check CRUSH rules for pools so that we are appropriately divided
14867 if (commit) {
14868 for (auto pool : pools) {
14869 pool->crush_rule = new_rule;
14870 pool->peering_crush_bucket_count = bucket_count;
14871 pool->peering_crush_bucket_target = bucket_count;
14872 pool->peering_crush_bucket_barrier = dividing_id;
14873 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14874 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14875 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14876 }
14877 pending_inc.change_stretch_mode = true;
14878 pending_inc.stretch_mode_enabled = true;
14879 pending_inc.new_stretch_bucket_count = bucket_count;
14880 pending_inc.new_degraded_stretch_mode = 0;
14881 pending_inc.new_stretch_mode_bucket = dividing_id;
14882 }
14883 *okay = true;
14884 return;
14885 }
14886
14887 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14888 set<int> *really_down_buckets,
14889 set<string> *really_down_mons)
14890 {
14891 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14892 ceph_assert(is_readable());
14893 if (dead_buckets.empty()) return false;
14894 set<int> down_cache;
14895 bool really_down = false;
14896 for (auto dbi : dead_buckets) {
14897 const string& bucket_name = dbi.first;
14898 ceph_assert(osdmap.crush->name_exists(bucket_name));
14899 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14900 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14901 << " to see if OSDs are also down" << dendl;
14902 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14903 if (subtree_down) {
14904 dout(20) << "subtree is down!" << dendl;
14905 really_down = true;
14906 really_down_buckets->insert(bucket_id);
14907 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14908 }
14909 }
14910 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14911 << " and mons " << *really_down_mons << " are really down" << dendl;
14912 return really_down;
14913 }
14914
14915 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14916 const set<string>& live_zones)
14917 {
14918 dout(20) << __func__ << dendl;
14919 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14920 // update the general OSDMap changes
14921 pending_inc.change_stretch_mode = true;
14922 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14923 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14924 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14925 ceph_assert(new_site_count == 1); // stretch count 2!
14926 pending_inc.new_degraded_stretch_mode = new_site_count;
14927 pending_inc.new_recovering_stretch_mode = 0;
14928 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14929
14930 // and then apply them to all the pg_pool_ts
14931 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14932 const string& remaining_site_name = *(live_zones.begin());
14933 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14934 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14935 for (auto pgi : osdmap.pools) {
14936 if (pgi.second.peering_crush_bucket_count) {
14937 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14938 newp.peering_crush_bucket_count = new_site_count;
14939 newp.peering_crush_mandatory_member = remaining_site;
14940 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14941 newp.set_last_force_op_resend(pending_inc.epoch);
14942 }
14943 }
14944 propose_pending();
14945 }
14946
14947 void OSDMonitor::trigger_recovery_stretch_mode()
14948 {
14949 dout(20) << __func__ << dendl;
14950 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14951 pending_inc.change_stretch_mode = true;
14952 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14953 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14954 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14955 pending_inc.new_recovering_stretch_mode = 1;
14956 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14957
14958 for (auto pgi : osdmap.pools) {
14959 if (pgi.second.peering_crush_bucket_count) {
14960 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14961 newp.set_last_force_op_resend(pending_inc.epoch);
14962 }
14963 }
14964 propose_pending();
14965 }
14966
// Note that we have entered degraded stretch mode: clear the recovery
// timestamp so a later recovery phase starts its wait from scratch.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14971
// Note that we have entered recovering stretch mode: stamp the start time
// (used by try_end_recovery_stretch_mode() to enforce a minimum wait),
// but only if it is not already set.
void OSDMonitor::set_recovery_stretch_mode()
{
  if (stretch_recovery_triggered.is_zero()) {
    stretch_recovery_triggered = ceph_clock_now();
  }
}
14978
// Note that stretch mode is healthy again: clear the recovery timestamp.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14983
14984 void OSDMonitor::notify_new_pg_digest()
14985 {
14986 dout(20) << __func__ << dendl;
14987 if (!stretch_recovery_triggered.is_zero()) {
14988 try_end_recovery_stretch_mode(false);
14989 }
14990 }
14991
14992 struct CMonExitRecovery : public Context {
14993 OSDMonitor *m;
14994 bool force;
14995 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14996 void finish(int r) {
14997 m->try_end_recovery_stretch_mode(force);
14998 }
14999 };
15000
// Attempt to leave recovering stretch mode and return to healthy.  Only
// the leader acts; requires readable state (otherwise re-queues itself via
// CMonExitRecovery).  Unless force is set, a minimum wait must have
// elapsed since recovery started and PG stats must show no degraded,
// inactive, or unknown PGs.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // retry once our state is readable again
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // equivalent to: now - trigger_time > min_wait (or force)
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // need fresh PG stats; retry when mgrstat is readable
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced objects are tolerated; they don't block leaving recovery
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
15030
15031 void OSDMonitor::trigger_healthy_stretch_mode()
15032 {
15033 ceph_assert(is_writeable());
15034 stretch_recovery_triggered.set_from_double(0);
15035 pending_inc.change_stretch_mode = true;
15036 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
15037 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
15038 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
15039 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
15040 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
15041 for (auto pgi : osdmap.pools) {
15042 if (pgi.second.peering_crush_bucket_count) {
15043 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
15044 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
15045 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
15046 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
15047 newp.set_last_force_op_resend(pending_inc.epoch);
15048 }
15049 }
15050 propose_pending();
15051 }