]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
bump version to 18.2.2-pve1
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate2.h"
51 #include "messages/MOSDPGCreated.h"
52 #include "messages/MOSDPGTemp.h"
53 #include "messages/MOSDPGReadyToMerge.h"
54 #include "messages/MMonCommand.h"
55 #include "messages/MRemoveSnaps.h"
56 #include "messages/MRoute.h"
57 #include "messages/MMonGetPurgedSnaps.h"
58 #include "messages/MMonGetPurgedSnapsReply.h"
59
60 #include "common/TextTable.h"
61 #include "common/Timer.h"
62 #include "common/ceph_argparse.h"
63 #include "common/perf_counters.h"
64 #include "common/PriorityCache.h"
65 #include "common/strtol.h"
66 #include "common/numa.h"
67
68 #include "common/config.h"
69 #include "common/errno.h"
70
71 #include "erasure-code/ErasureCodePlugin.h"
72 #include "compressor/Compressor.h"
73 #include "common/Checksummer.h"
74
75 #include "include/compat.h"
76 #include "include/ceph_assert.h"
77 #include "include/stringify.h"
78 #include "include/util.h"
79 #include "common/cmdparse.h"
80 #include "include/str_list.h"
81 #include "include/str_map.h"
82 #include "include/scope_guard.h"
83 #include "perfglue/heap_profiler.h"
84
85 #include "auth/cephx/CephxKeyServer.h"
86 #include "osd/OSDCap.h"
87
88 #include "json_spirit/json_spirit_reader.h"
89
90 #include <boost/algorithm/string/predicate.hpp>
91
92 using std::dec;
93 using std::hex;
94 using std::list;
95 using std::map;
96 using std::make_pair;
97 using std::ostringstream;
98 using std::pair;
99 using std::set;
100 using std::string;
101 using std::stringstream;
102 using std::to_string;
103 using std::vector;
104
105 using ceph::bufferlist;
106 using ceph::decode;
107 using ceph::encode;
108 using ceph::ErasureCodeInterfaceRef;
109 using ceph::ErasureCodePluginRegistry;
110 using ceph::ErasureCodeProfile;
111 using ceph::Formatter;
112 using ceph::JSONFormatter;
113 using ceph::make_message;
114
115 #define dout_subsys ceph_subsys_mon
116 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
117 static const string OSD_METADATA_PREFIX("osd_metadata");
118 static const string OSD_SNAP_PREFIX("osd_snap");
119
120 /*
121
122 OSD snapshot metadata
123 ---------------------
124
125 -- starting with mimic, removed in octopus --
126
127 "removed_epoch_%llu_%08lx" % (pool, epoch)
128 -> interval_set<snapid_t>
129
130 "removed_snap_%llu_%016llx" % (pool, last_snap)
131 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
132
133
134 -- starting with mimic --
135
136 "purged_snap_%llu_%016llx" % (pool, last_snap)
137 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
138
139  - note that the {removed,purged}_snap put the last snap in the key so
140 that we can use forward iteration only to search for an epoch in an
141 interval. e.g., to test if epoch N is removed/purged, we'll find a key
142 >= N that either does or doesn't contain the given snap.
143
144
145 -- starting with octopus --
146
147 "purged_epoch_%08lx" % epoch
148 -> map<int64_t,interval_set<snapid_t>>
149
150 */
151 using namespace TOPNSPC::common;
152 namespace {
153
// Adapter that exposes an OSDMonitor-owned osdmap cache to the
// PriorityCache manager (pcm) so its memory budget can be balanced
// against the rocksdb cache.  Subclasses supply the used-byte count and
// a display name.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;
  // bytes requested/assigned per priority level (only PRI1 is used today)
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // size granted by the last commit_cache_size()
  double cache_ratio = 0;       // share of the total budget assigned to us

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // bytes currently consumed by the underlying cache (subclass-specific)
  virtual uint64_t _get_used_bytes() const = 0;

  // How many more bytes we would like at priority 'pri'.  All items are
  // accounted at PRI1; other priorities return -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // only ask for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // total bytes assigned across every priority level
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // round the assigned total to a pcm chunk and remember it as committed
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // bin-based aging is not implemented for these caches; these are
  // required no-ops of the PriCache interface
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  virtual string get_cache_name() const = 0;
};
227
228 struct IncCache : public OSDMemCache {
229 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
230
231 virtual uint64_t _get_used_bytes() const {
232 return osdmon->inc_osd_cache.get_bytes();
233 }
234
235 virtual string get_cache_name() const {
236 return "OSDMap Inc Cache";
237 }
238
239 uint64_t _get_num_osdmaps() const {
240 return osdmon->inc_osd_cache.get_size();
241 }
242 };
243
244 struct FullCache : public OSDMemCache {
245 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
246
247 virtual uint64_t _get_used_bytes() const {
248 return osdmon->full_osd_cache.get_bytes();
249 }
250
251 virtual string get_cache_name() const {
252 return "OSDMap Full Cache";
253 }
254
255 uint64_t _get_num_osdmaps() const {
256 return osdmon->full_osd_cache.get_size();
257 }
258 };
259
260 std::shared_ptr<IncCache> inc_cache;
261 std::shared_ptr<FullCache> full_cache;
262
263 const uint32_t MAX_POOL_APPLICATIONS = 4;
264 const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
265 const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
266
267 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
268 // Note: this doesn't include support for the application tag match
269 if ((grant.spec.allow & OSD_CAP_W) != 0) {
270 auto& match = grant.match;
271 if (match.is_match_all()) {
272 return true;
273 } else if (pool_name != nullptr &&
274 !match.pool_namespace.pool_name.empty() &&
275 match.pool_namespace.pool_name == *pool_name) {
276 return true;
277 }
278 }
279 return false;
280 }
281
// Decide whether 'entity_name' may perform unmanaged-snapshot pool ops.
// Permission is granted if either (a) the entity's mon caps allow the
// "osd pool op unmanaged-snap" command (optionally restricted to
// 'pool_name'), or (b) its OSD caps from the auth db grant write access
// to the pool (or to all pools).  pool_name == nullptr means the pool
// does not exist, which requires an unrestricted capability.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // fall back to inspecting the entity's OSD caps stored in the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand to concrete grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
349
350 } // anonymous namespace
351
// Record that PG 'ps' of this pool reported 'last_epoch_clean'.  Keeps a
// per-PG epoch vector, maintains 'floor' (the minimum over all PGs) and
// advances 'next_missing' (the first PG that has not reported yet; an
// entry of 0 means "not reported").
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
				 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // this PG was the first unreported one: scan forward to find the next
  // PG that still has no report
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
385
386 void LastEpochClean::remove_pool(uint64_t pool)
387 {
388 report_by_pool.erase(pool);
389 }
390
391 void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
392 epoch_t last_epoch_clean)
393 {
394 auto& lec = report_by_pool[pg.pool()];
395 return lec.report(pg_num, pg.ps(), last_epoch_clean);
396 }
397
398 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
399 {
400 auto floor = latest.get_epoch();
401 for (auto& pool : latest.get_pools()) {
402 auto reported = report_by_pool.find(pool.first);
403 if (reported == report_by_pool.end()) {
404 return 0;
405 }
406 if (reported->second.next_missing < pool.second.get_pg_num()) {
407 return 0;
408 }
409 if (reported->second.floor < floor) {
410 floor = reported->second.floor;
411 }
412 }
413 return floor;
414 }
415
416 void LastEpochClean::dump(Formatter *f) const
417 {
418 f->open_array_section("per_pool");
419
420 for (auto& [pool, lec] : report_by_pool) {
421 f->open_object_section("pool");
422 f->dump_unsigned("poolid", pool);
423 f->dump_unsigned("floor", lec.floor);
424 f->close_section();
425 }
426
427 f->close_section();
428 }
429
// Completion context fired when a background osdmap->PG mapping job
// finishes; on success it refreshes the creating-PGs state and notifies
// subscribers waiting on PG creations.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;  // when the mapping job was kicked off (for timing the log line)
  epoch_t epoch;  // osdmap epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
447
448 #undef dout_prefix
449 #define dout_prefix _prefix(_dout, mon, osdmap)
450 static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
451 return *_dout << "mon." << mon.name << "@" << mon.rank
452 << "(" << mon.get_state_name()
453 << ").osd e" << osdmap.get_epoch() << " ";
454 }
455
// Construct the OSD monitor service: size the inc/full osdmap LRU
// caches, register the pcm cache wrappers, and subscribe to config
// changes for cache-related options.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to fixed-size LRU caches without pcm auto-tuning
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
479
480 const char **OSDMonitor::get_tracked_conf_keys() const
481 {
482 static const char* KEYS[] = {
483 "mon_memory_target",
484 "mon_memory_autotune",
485 "rocksdb_cache_size",
486 NULL
487 };
488 return KEYS;
489 }
490
// Config-observer callback: react to runtime changes of the keys listed
// in get_tracked_conf_keys() by toggling autotuning and/or resizing the
// pcm-managed caches.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
512
// Enable or disable pcm-driven cache autotuning to match the current
// value of mon_memory_autotune; updates the mon_memory_autotune member
// to reflect what actually took effect.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
534
// Apply new mon_memory_target / rocksdb_cache_size values to the
// priority cache manager.  Returns 0 on success, -EINVAL if the new
// values are invalid or the cache ratios cannot be recomputed (in which
// case the previous sizes are restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation when computing the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
594
// Read the initial cache-sizing options; when autotuning is requested,
// validate them and prime the inc/full LRU caches with the minimum byte
// budget.  Returns -EINVAL on invalid sizes (caller falls back to
// fixed-size caches), 0 otherwise.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // NOTE(review): reuses the osd_memory_* options for the mon-side
    // base/fragmentation estimates
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
619
620 bool OSDMonitor::_have_pending_crush()
621 {
622 return pending_inc.crush.length() > 0;
623 }
624
625 CrushWrapper &OSDMonitor::_get_stable_crush()
626 {
627 return *osdmap.crush;
628 }
629
630 CrushWrapper OSDMonitor::_get_pending_crush()
631 {
632 bufferlist bl;
633 if (pending_inc.crush.length())
634 bl = pending_inc.crush;
635 else
636 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
637
638 auto p = bl.cbegin();
639 CrushWrapper crush;
640 crush.decode(p);
641 return crush;
642 }
643
// Build the very first osdmap (epoch 1) for a new cluster and stage it
// in pending_inc as a full map: either decode an mkfs-provided osdmap
// from the store or build a simple default one, then apply default
// flags, full-ratio settings and release requirements.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios configured as percentages (>1.0) are converted to fractions
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_reef")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
      derr << __func__ << " mon_debug_no_require_reef and quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    } else {
      derr << __func__ << " mon_debug_no_require_reef=true" << dendl;
      newmap.require_osd_release = ceph_release_t::quincy;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::reef;
  }

  // abort on an unparseable min-compat-client option; a bad value here
  // would silently mis-gate client features
  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
702
703 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
704 {
705 s.insert(service_name);
706 s.insert(OSD_PG_CREATING_PREFIX);
707 s.insert(OSD_METADATA_PREFIX);
708 s.insert(OSD_SNAP_PREFIX);
709 }
710
// Bring the in-memory osdmap up to date with what paxos has committed:
// locate/load the newest stored full map, then apply each newer
// incremental (persisting re-encoded full maps as we go, and resetting
// to the primary's canonical encoding on CRC mismatch), and finally
// refresh derived state: down->pending-out tracking, subscriptions,
// messenger features, mapping job and stretch-mode transitions.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // remembered so the stretch-mode logic below can tell whether OSDs came up
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stored full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump ahead to the newest stored full map instead of replaying
    // every incremental from our current epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE(review): the statements between each dout(20) and its
	// *_dout << dendl appear to live inside the scope opened by the
	// dout macro (hence two distinct 'jf' locals) — confirm against
	// dout_impl before touching this
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long catch-up doesn't build one giant txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd);
      }
    }
    for (auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
	// manually marked out, so drop it
	osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // refresh the down -> pending-out tracking for the new map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
	marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
	mon.set_degraded_stretch_mode();
	dout(20) << "prev_num_up_osd: " << prev_num_up_osd << dendl;
	dout(20) << "osdmap.num_up_osd: " << osdmap.num_up_osd << dendl;
	dout(20) << "osdmap.num_osd: " << osdmap.num_osd << dendl;
	dout(20) << "mon_stretch_cluster_recovery_ratio: " << cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") << dendl;
	// enough OSDs came back up: consider switching to recovery mode
	if (prev_num_up_osd < osdmap.num_up_osd &&
	    (osdmap.num_up_osd / (double)osdmap.num_osd) >
	    cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio") &&
	    mon.dead_mon_buckets.size() == 0) {
	  // TODO: This works for 2-site clusters when the OSD maps are appropriately
	  // trimmed and everything is "normal" but not if you have a lot of out OSDs
	  // you're ignoring or in some really degenerate failure cases

	  dout(10) << "Enabling recovery stretch mode in this map" << dendl;
	  mon.go_recovery_stretch_mode();
	}
      } else {
	mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
	(!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
988
// Create the PriorityCache manager and register the rocksdb, inc and
// full caches with it so their memory is balanced within
// mon_memory_target.  Returns 0 on success, -EINVAL on bad sizes, no
// rocksdb priority cache, or unset-table ratios.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
          << " pcm max: " << max
          << " pcm min: " << min
          << " inc_osd_cache size: " << inc_osd_cache.get_size()
          << dendl;
  return 0;
}
1038
1039 int OSDMonitor::_set_cache_ratios()
1040 {
1041 double old_cache_kv_ratio = cache_kv_ratio;
1042
1043 // Set the cache ratios for kv(rocksdb), inc and full caches
1044 cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
1045 if (cache_kv_ratio >= 1.0) {
1046 derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
1047 << ") must be in range [0,<1.0]."
1048 << dendl;
1049 cache_kv_ratio = old_cache_kv_ratio;
1050 return -EINVAL;
1051 }
1052 rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
1053 cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
1054 inc_cache->set_cache_ratio(cache_inc_ratio);
1055 full_cache->set_cache_ratio(cache_full_ratio);
1056
1057 dout(1) << __func__ << " kv ratio " << cache_kv_ratio
1058 << " inc ratio " << cache_inc_ratio
1059 << " full ratio " << cache_full_ratio
1060 << dendl;
1061 return 0;
1062 }
1063
1064 void OSDMonitor::start_mapping()
1065 {
1066 // initiate mapping job
1067 if (mapping_job) {
1068 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1069 << dendl;
1070 mapping_job->abort();
1071 }
1072 if (!osdmap.get_pools().empty()) {
1073 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1074 mapping_job = mapping.start_update(osdmap, mapper,
1075 g_conf()->mon_osd_mapping_pgs_per_chunk);
1076 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1077 << " at " << fin->start << dendl;
1078 mapping_job->set_finish_event(fin);
1079 } else {
1080 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1081 mapping_job = nullptr;
1082 }
1083 }
1084
1085 void OSDMonitor::update_msgr_features()
1086 {
1087 const int types[] = {
1088 entity_name_t::TYPE_OSD,
1089 entity_name_t::TYPE_CLIENT,
1090 entity_name_t::TYPE_MDS,
1091 entity_name_t::TYPE_MON
1092 };
1093 for (int type : types) {
1094 uint64_t mask;
1095 uint64_t features = osdmap.get_features(type, &mask);
1096 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1097 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1098 ceph::net::Policy p = mon.messenger->get_policy(type);
1099 p.features_required = (p.features_required & ~mask) | features;
1100 mon.messenger->set_policy(type, p);
1101 }
1102 }
1103 }
1104
1105 void OSDMonitor::on_active()
1106 {
1107 update_logger();
1108
1109 if (mon.is_leader()) {
1110 mon.clog->debug() << "osdmap " << osdmap;
1111 if (!priority_convert) {
1112 // Only do this once at start-up
1113 convert_pool_priorities();
1114 priority_convert = true;
1115 }
1116 } else {
1117 list<MonOpRequestRef> ls;
1118 take_all_failures(ls);
1119 while (!ls.empty()) {
1120 MonOpRequestRef op = ls.front();
1121 op->mark_osdmon_event(__func__);
1122 dispatch(op);
1123 ls.pop_front();
1124 }
1125 }
1126 start_mapping();
1127 }
1128
void OSDMonitor::on_restart()
{
  // Forget per-osd report timestamps; they will be repopulated as new
  // reports arrive after the restart.
  last_osd_report.clear();
}
1133
1134 void OSDMonitor::on_shutdown()
1135 {
1136 dout(10) << __func__ << dendl;
1137 if (mapping_job) {
1138 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1139 << dendl;
1140 mapping_job->abort();
1141 }
1142
1143 // discard failure info, waiters
1144 list<MonOpRequestRef> ls;
1145 take_all_failures(ls);
1146 ls.clear();
1147 }
1148
1149 void OSDMonitor::update_logger()
1150 {
1151 dout(10) << "update_logger" << dendl;
1152
1153 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1154 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1155 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1156 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1157 }
1158
1159 void OSDMonitor::create_pending()
1160 {
1161 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1162 pending_inc.fsid = mon.monmap->fsid;
1163 pending_metadata.clear();
1164 pending_metadata_rm.clear();
1165 pending_pseudo_purged_snaps.clear();
1166
1167 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1168
1169 // safety checks (this shouldn't really happen)
1170 {
1171 if (osdmap.backfillfull_ratio <= 0) {
1172 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1173 if (pending_inc.new_backfillfull_ratio > 1.0)
1174 pending_inc.new_backfillfull_ratio /= 100;
1175 dout(1) << __func__ << " setting backfillfull_ratio = "
1176 << pending_inc.new_backfillfull_ratio << dendl;
1177 }
1178 if (osdmap.full_ratio <= 0) {
1179 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1180 if (pending_inc.new_full_ratio > 1.0)
1181 pending_inc.new_full_ratio /= 100;
1182 dout(1) << __func__ << " setting full_ratio = "
1183 << pending_inc.new_full_ratio << dendl;
1184 }
1185 if (osdmap.nearfull_ratio <= 0) {
1186 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1187 if (pending_inc.new_nearfull_ratio > 1.0)
1188 pending_inc.new_nearfull_ratio /= 100;
1189 dout(1) << __func__ << " setting nearfull_ratio = "
1190 << pending_inc.new_nearfull_ratio << dendl;
1191 }
1192 }
1193 }
1194
/**
 * Fold the effects of the pending incremental into a private copy of
 * the creating-pgs bookkeeping and return it (the caller encodes it).
 *
 * @param inc the pending incremental about to be committed
 * @param nextmap the osdmap as it will look once @p inc is applied
 * @return updated creating_pgs_t snapshot for this epoch
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // copy under the lock, then do the heavy work on the copy
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // queue creations for pools in the current map ...
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    // ... and for pools added by this incremental
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    // drop bookkeeping for pools removed by this incremental
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue
  // move pgs from the per-pool queue into the in-flight set, up to the
  // configured cap on concurrently-creating pgs
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    // take as many pg ids as fit under the cap from this pool's range
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        std::stringstream debug;
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          // a new interval started: stamp the history fields that
          // changed with this epoch
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1381
// Pre-populate pending_inc with pg_temp entries so acting sets carry
// over across the upcoming osdmap change.  Depending on how widespread
// the change is, prime either every pg ("all") or only the pgs mapped
// to the affected osds; both paths are bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    // a new crush map can remap anything; consider every pg
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    // newly-up osds can take over mappings anywhere
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    // CEPH_OSD_UP in new_state toggles the up flag; together with
    // is_up() this selects osds going down (encode_pending logs these
    // same entries as DOWN)
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs toward this osd from anywhere
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // estimate the per-osd workload; if it would touch a large fraction
    // of all pgs anyway, fall back to priming everything
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // build the post-incremental map so we can compare current vs. next
  // mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // prime every pg via the parallel mapper, bounded by the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    // prime only pgs mapped to the affected osds, checking the clock
    // every `chunk` pgs to respect the same time budget
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          // already primed via another affected osd
          continue;
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1484
// Add a pg_temp entry to pending_inc pinning @p pgid to its current
// acting set if its mapping would otherwise change in @p next; skips
// cases where a temp mapping cannot help.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    // still being created; nothing to preserve yet
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping, from the background mapping job's results
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (next) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // priming an empty acting vector clears the pg_temp instead
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // serialized with other primers running in the parallel mapper
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1532
1533 /**
1534 * @note receiving a transaction in this function gives a fair amount of
1535 * freedom to the service implementation if it does need it. It shouldn't.
1536 */
1537 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1538 {
1539 dout(10) << "encode_pending e " << pending_inc.epoch
1540 << dendl;
1541
1542 if (do_prune(t)) {
1543 dout(1) << __func__ << " osdmap full prune encoded e"
1544 << pending_inc.epoch << dendl;
1545 }
1546
1547 // finalize up pending_inc
1548 pending_inc.modified = ceph_clock_now();
1549
1550 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1551 ceph_assert(r == 0);
1552
1553 if (mapping_job) {
1554 if (!mapping_job->is_done()) {
1555 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1556 << mapping_job.get() << " did not complete, "
1557 << mapping_job->shards << " left" << dendl;
1558 mapping_job->abort();
1559 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1560 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1561 << mapping_job.get() << " is prior epoch "
1562 << mapping.get_epoch() << dendl;
1563 } else {
1564 if (g_conf()->mon_osd_prime_pg_temp) {
1565 maybe_prime_pg_temp();
1566 }
1567 }
1568 } else if (g_conf()->mon_osd_prime_pg_temp) {
1569 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1570 << dendl;
1571 }
1572 mapping_job.reset();
1573
  // ensure we don't have blank new_state updates. these are interpreted as
  // CEPH_OSD_UP (and almost certainly not what we want!).
1576 auto p = pending_inc.new_state.begin();
1577 while (p != pending_inc.new_state.end()) {
1578 if (p->second == 0) {
1579 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1580 p = pending_inc.new_state.erase(p);
1581 } else {
1582 if (p->second & CEPH_OSD_UP) {
1583 pending_inc.new_last_up_change = pending_inc.modified;
1584 }
1585 ++p;
1586 }
1587 }
1588 if (!pending_inc.new_up_client.empty()) {
1589 pending_inc.new_last_up_change = pending_inc.modified;
1590 }
1591 for (auto& i : pending_inc.new_weight) {
1592 if (i.first >= osdmap.max_osd) {
1593 if (i.second) {
1594 // new osd is already marked in
1595 pending_inc.new_last_in_change = pending_inc.modified;
1596 break;
1597 }
1598 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1599 // existing osd marked in or out
1600 pending_inc.new_last_in_change = pending_inc.modified;
1601 break;
1602 }
1603 }
1604
1605 {
1606 OSDMap tmp;
1607 tmp.deepish_copy_from(osdmap);
1608 tmp.apply_incremental(pending_inc);
1609
1610 // clean pg_temp mappings
1611 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1612
1613 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1614 {
1615 // check every upmapped pg for now
1616 // until we could reliably identify certain cases to ignore,
1617 // which is obviously the hard part TBD..
1618 vector<pg_t> pgs_to_check;
1619 tmp.get_upmap_pgs(&pgs_to_check);
1620 if (pgs_to_check.size() <
1621 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1622 // not enough pgs, do it inline
1623 tmp.clean_pg_upmaps(cct, &pending_inc);
1624 } else {
1625 CleanUpmapJob job(cct, tmp, pending_inc);
1626 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1627 job.wait();
1628 }
1629 }
1630
1631 // update creating pgs first so that we can remove the created pgid and
1632 // process the pool flag removal below in the same osdmap epoch.
1633 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1634 bufferlist creatings_bl;
1635 uint64_t features = CEPH_FEATURES_ALL;
1636 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1637 dout(20) << __func__ << " encoding pending pgs without octopus features"
1638 << dendl;
1639 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1640 }
1641 encode(pending_creatings, creatings_bl, features);
1642 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1643
1644 // remove any old (or incompat) POOL_CREATING flags
1645 for (auto& i : tmp.get_pools()) {
1646 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1647 // pre-nautilus OSDMaps shouldn't get this flag.
1648 if (pending_inc.new_pools.count(i.first)) {
1649 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1650 }
1651 }
1652 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1653 !pending_creatings.still_creating_pool(i.first)) {
1654 dout(10) << __func__ << " done creating pool " << i.first
1655 << ", clearing CREATING flag" << dendl;
1656 if (pending_inc.new_pools.count(i.first) == 0) {
1657 pending_inc.new_pools[i.first] = i.second;
1658 }
1659 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1660 }
1661 }
1662
1663 // collect which pools are currently affected by
1664 // the near/backfill/full osd(s),
1665 // and set per-pool near/backfill/full flag instead
1666 set<int64_t> full_pool_ids;
1667 set<int64_t> backfillfull_pool_ids;
1668 set<int64_t> nearfull_pool_ids;
1669 tmp.get_full_pools(cct,
1670 &full_pool_ids,
1671 &backfillfull_pool_ids,
1672 &nearfull_pool_ids);
1673 if (full_pool_ids.empty() ||
1674 backfillfull_pool_ids.empty() ||
1675 nearfull_pool_ids.empty()) {
1676 // normal case - no nearfull, backfillfull or full osds
1677 // try cancel any improper nearfull/backfillfull/full pool
1678 // flags first
1679 for (auto &pool: tmp.get_pools()) {
1680 auto p = pool.first;
1681 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1682 nearfull_pool_ids.empty()) {
1683 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1684 << "'s nearfull flag" << dendl;
1685 if (pending_inc.new_pools.count(p) == 0) {
1686 // load original pool info first!
1687 pending_inc.new_pools[p] = pool.second;
1688 }
1689 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1690 }
1691 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1692 backfillfull_pool_ids.empty()) {
1693 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1694 << "'s backfillfull flag" << dendl;
1695 if (pending_inc.new_pools.count(p) == 0) {
1696 pending_inc.new_pools[p] = pool.second;
1697 }
1698 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1699 }
1700 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1701 full_pool_ids.empty()) {
1702 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1703 // set by EQUOTA, skipping
1704 continue;
1705 }
1706 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1707 << "'s full flag" << dendl;
1708 if (pending_inc.new_pools.count(p) == 0) {
1709 pending_inc.new_pools[p] = pool.second;
1710 }
1711 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1712 }
1713 }
1714 }
1715 if (!full_pool_ids.empty()) {
1716 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1717 << " as full" << dendl;
1718 for (auto &p: full_pool_ids) {
1719 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1720 continue;
1721 }
1722 if (pending_inc.new_pools.count(p) == 0) {
1723 pending_inc.new_pools[p] = tmp.pools[p];
1724 }
1725 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1726 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1727 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1728 }
1729 // cancel FLAG_FULL for pools which are no longer full too
1730 for (auto &pool: tmp.get_pools()) {
1731 auto p = pool.first;
1732 if (full_pool_ids.count(p)) {
1733 // skip pools we have just marked as full above
1734 continue;
1735 }
1736 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1737 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1738 // don't touch if currently is not full
1739 // or is running out of quota (and hence considered as full)
1740 continue;
1741 }
1742 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1743 << "'s full flag" << dendl;
1744 if (pending_inc.new_pools.count(p) == 0) {
1745 pending_inc.new_pools[p] = pool.second;
1746 }
1747 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1748 }
1749 }
1750 if (!backfillfull_pool_ids.empty()) {
1751 for (auto &p: backfillfull_pool_ids) {
1752 if (full_pool_ids.count(p)) {
1753 // skip pools we have already considered as full above
1754 continue;
1755 }
1756 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1757 // make sure FLAG_FULL is truly set, so we are safe not
1758 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1759 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1760 continue;
1761 }
1762 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1763 // don't bother if pool is already marked as backfillfull
1764 continue;
1765 }
1766 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1767 << "'s as backfillfull" << dendl;
1768 if (pending_inc.new_pools.count(p) == 0) {
1769 pending_inc.new_pools[p] = tmp.pools[p];
1770 }
1771 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1772 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1773 }
1774 // cancel FLAG_BACKFILLFULL for pools
1775 // which are no longer backfillfull too
1776 for (auto &pool: tmp.get_pools()) {
1777 auto p = pool.first;
1778 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1779 // skip pools we have just marked as backfillfull/full above
1780 continue;
1781 }
1782 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1783 // and don't touch if currently is not backfillfull
1784 continue;
1785 }
1786 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1787 << "'s backfillfull flag" << dendl;
1788 if (pending_inc.new_pools.count(p) == 0) {
1789 pending_inc.new_pools[p] = pool.second;
1790 }
1791 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1792 }
1793 }
1794 if (!nearfull_pool_ids.empty()) {
1795 for (auto &p: nearfull_pool_ids) {
1796 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1797 continue;
1798 }
1799 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1800 // make sure FLAG_FULL is truly set, so we are safe not
1801 // to set a extra (redundant) FLAG_NEARFULL flag
1802 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1803 continue;
1804 }
1805 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1806 // don't bother if pool is already marked as nearfull
1807 continue;
1808 }
1809 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1810 << "'s as nearfull" << dendl;
1811 if (pending_inc.new_pools.count(p) == 0) {
1812 pending_inc.new_pools[p] = tmp.pools[p];
1813 }
1814 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1815 }
1816 // cancel FLAG_NEARFULL for pools
1817 // which are no longer nearfull too
1818 for (auto &pool: tmp.get_pools()) {
1819 auto p = pool.first;
1820 if (full_pool_ids.count(p) ||
1821 backfillfull_pool_ids.count(p) ||
1822 nearfull_pool_ids.count(p)) {
1823 // skip pools we have just marked as
1824 // nearfull/backfillfull/full above
1825 continue;
1826 }
1827 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1828 // and don't touch if currently is not nearfull
1829 continue;
1830 }
1831 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1832 << "'s nearfull flag" << dendl;
1833 if (pending_inc.new_pools.count(p) == 0) {
1834 pending_inc.new_pools[p] = pool.second;
1835 }
1836 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1837 }
1838 }
1839
1840 // min_compat_client?
1841 if (!tmp.require_min_compat_client) {
1842 auto mv = tmp.get_min_compat_client();
1843 dout(1) << __func__ << " setting require_min_compat_client to currently "
1844 << "required " << mv << dendl;
1845 mon.clog->info() << "setting require_min_compat_client to currently "
1846 << "required " << mv;
1847 pending_inc.new_require_min_compat_client = mv;
1848 }
1849
1850 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1851 tmp.require_osd_release >= ceph_release_t::nautilus) {
1852 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1853 // add creating flags?
1854 for (auto& i : tmp.get_pools()) {
1855 if (pending_creatings.still_creating_pool(i.first)) {
1856 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1857 << dendl;
1858 if (pending_inc.new_pools.count(i.first) == 0) {
1859 pending_inc.new_pools[i.first] = i.second;
1860 }
1861 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1862 }
1863 }
1864 // adjust blocklist items to all be TYPE_ANY
1865 for (auto& i : tmp.blocklist) {
1866 auto a = i.first;
1867 a.set_type(entity_addr_t::TYPE_ANY);
1868 pending_inc.new_blocklist[a] = i.second;
1869 pending_inc.old_blocklist.push_back(i.first);
1870 }
1871 }
1872
1873 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1874 tmp.require_osd_release >= ceph_release_t::octopus) {
1875 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1876
1877 // adjust obsoleted cache modes
1878 for (auto& [poolid, pi] : tmp.pools) {
1879 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1880 if (pending_inc.new_pools.count(poolid) == 0) {
1881 pending_inc.new_pools[poolid] = pi;
1882 }
1883 dout(10) << __func__ << " switching pool " << poolid
1884 << " cachemode from forward -> proxy" << dendl;
1885 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1886 }
1887 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1888 if (pending_inc.new_pools.count(poolid) == 0) {
1889 pending_inc.new_pools[poolid] = pi;
1890 }
1891 dout(10) << __func__ << " switching pool " << poolid
1892 << " cachemode from readforward -> readproxy" << dendl;
1893 pending_inc.new_pools[poolid].cache_mode =
1894 pg_pool_t::CACHEMODE_READPROXY;
1895 }
1896 }
1897
1898 // clear removed_snaps for every pool
1899 for (auto& [poolid, pi] : tmp.pools) {
1900 if (pi.removed_snaps.empty()) {
1901 continue;
1902 }
1903 if (pending_inc.new_pools.count(poolid) == 0) {
1904 pending_inc.new_pools[poolid] = pi;
1905 }
1906 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1907 << dendl;
1908 pending_inc.new_pools[poolid].removed_snaps.clear();
1909 }
1910
1911 // create a combined purged snap epoch key for all purged snaps
1912 // prior to this epoch, and store it in the current epoch (i.e.,
1913 // the last pre-octopus epoch, just prior to the one we're
1914 // encoding now).
1915 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1916 it->lower_bound("purged_snap_");
1917 map<int64_t,snap_interval_set_t> combined;
1918 while (it->valid()) {
1919 if (it->key().find("purged_snap_") != 0) {
1920 break;
1921 }
1922 string k = it->key();
1923 long long unsigned pool;
1924 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1925 if (n != 1) {
1926 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1927 } else {
1928 bufferlist v = it->value();
1929 auto p = v.cbegin();
1930 snapid_t begin, end;
1931 ceph::decode(begin, p);
1932 ceph::decode(end, p);
1933 combined[pool].insert(begin, end - begin);
1934 }
1935 it->next();
1936 }
1937 if (!combined.empty()) {
1938 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1939 bufferlist v;
1940 ceph::encode(combined, v);
1941 t->put(OSD_SNAP_PREFIX, k, v);
1942 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1943 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1944 << dendl;
1945 } else {
1946 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1947 << dendl;
1948 }
1949
1950 // clean out the old removed_snap_ and removed_epoch keys
1951 // ('`' is ASCII '_' + 1)
1952 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1953 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1954 }
1955 }
1956
1957 // tell me about it
1958 for (auto i = pending_inc.new_state.begin();
1959 i != pending_inc.new_state.end();
1960 ++i) {
1961 int s = i->second ? i->second : CEPH_OSD_UP;
1962 if (s & CEPH_OSD_UP) {
1963 dout(2) << " osd." << i->first << " DOWN" << dendl;
1964 // Reset laggy parameters if failure interval exceeds a threshold.
1965 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1966 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1967 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1968 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1969 set_default_laggy_params(i->first);
1970 }
1971 }
1972 }
1973 if (s & CEPH_OSD_EXISTS)
1974 dout(2) << " osd." << i->first << " DNE" << dendl;
1975 }
1976 for (auto i = pending_inc.new_up_client.begin();
1977 i != pending_inc.new_up_client.end();
1978 ++i) {
1979 //FIXME: insert cluster addresses too
1980 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1981 }
1982 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1983 i != pending_inc.new_weight.end();
1984 ++i) {
1985 if (i->second == CEPH_OSD_OUT) {
1986 dout(2) << " osd." << i->first << " OUT" << dendl;
1987 } else if (i->second == CEPH_OSD_IN) {
1988 dout(2) << " osd." << i->first << " IN" << dendl;
1989 } else {
1990 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1991 }
1992 }
1993
1994 // features for osdmap and its incremental
1995 uint64_t features;
1996
1997 // encode full map and determine its crc
1998 OSDMap tmp;
1999 {
2000 tmp.deepish_copy_from(osdmap);
2001 tmp.apply_incremental(pending_inc);
2002
2003 // determine appropriate features
2004 features = tmp.get_encoding_features();
2005 dout(10) << __func__ << " encoding full map with "
2006 << tmp.require_osd_release
2007 << " features " << features << dendl;
2008
2009 // the features should be a subset of the mon quorum's features!
2010 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2011
2012 bufferlist fullbl;
2013 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2014 pending_inc.full_crc = tmp.get_crc();
2015
2016 // include full map in the txn. note that old monitors will
2017 // overwrite this. new ones will now skip the local full map
2018 // encode and reload from this.
2019 put_version_full(t, pending_inc.epoch, fullbl);
2020 }
2021
2022 // encode
2023 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2024 bufferlist bl;
2025 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2026
2027 dout(20) << " full_crc " << tmp.get_crc()
2028 << " inc_crc " << pending_inc.inc_crc << dendl;
2029
2030 /* put everything in the transaction */
2031 put_version(t, pending_inc.epoch, bl);
2032 put_last_committed(t, pending_inc.epoch);
2033
2034 // metadata, too!
2035 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2036 p != pending_metadata.end();
2037 ++p) {
2038 Metadata m;
2039 auto mp = p->second.cbegin();
2040 decode(m, mp);
2041 auto it = m.find("osd_objectstore");
2042 if (it != m.end()) {
2043 if (it->second == "filestore") {
2044 filestore_osds.insert(p->first);
2045 } else {
2046 filestore_osds.erase(p->first);
2047 }
2048 }
2049 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2050 }
2051 for (set<int>::iterator p = pending_metadata_rm.begin();
2052 p != pending_metadata_rm.end();
2053 ++p) {
2054 filestore_osds.erase(*p);
2055 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2056 }
2057 pending_metadata.clear();
2058 pending_metadata_rm.clear();
2059
2060 // purged_snaps
2061 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2062 !pending_inc.new_purged_snaps.empty()) {
2063 // all snaps purged this epoch (across all pools)
2064 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2065 bufferlist v;
2066 encode(pending_inc.new_purged_snaps, v);
2067 t->put(OSD_SNAP_PREFIX, k, v);
2068 }
2069 for (auto& i : pending_inc.new_purged_snaps) {
2070 for (auto q = i.second.begin();
2071 q != i.second.end();
2072 ++q) {
2073 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2074 pending_inc.epoch,
2075 t);
2076 }
2077 }
2078 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2079 for (auto snap : snaps) {
2080 insert_purged_snap_update(pool, snap, snap + 1,
2081 pending_inc.epoch,
2082 t);
2083 }
2084 }
2085
2086 // health
2087 health_check_map_t next;
2088 tmp.check_health(cct, &next);
2089 // OSD_FILESTORE
2090 check_for_filestore_osds(&next);
2091 encode_health(next, t);
2092 }
2093
2094 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2095 {
2096 bufferlist bl;
2097 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2098 if (r < 0)
2099 return r;
2100 try {
2101 auto p = bl.cbegin();
2102 decode(m, p);
2103 }
2104 catch (ceph::buffer::error& e) {
2105 if (err)
2106 *err << "osd." << osd << " metadata is corrupt";
2107 return -EIO;
2108 }
2109 return 0;
2110 }
2111
2112 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2113 {
2114 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2115 if (osdmap.is_up(osd)) {
2116 map<string,string> meta;
2117 load_metadata(osd, meta, nullptr);
2118 auto p = meta.find(field);
2119 if (p == meta.end()) {
2120 (*out)["unknown"]++;
2121 } else {
2122 (*out)[p->second]++;
2123 }
2124 }
2125 }
2126 }
2127
2128 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2129 {
2130 map<string,int> by_val;
2131 count_metadata(field, &by_val);
2132 f->open_object_section(field.c_str());
2133 for (auto& p : by_val) {
2134 f->dump_int(p.first.c_str(), p.second);
2135 }
2136 f->close_section();
2137 }
2138
2139 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2140 {
2141 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2142 if (osdmap.is_up(osd)) {
2143 map<string,string> meta;
2144 load_metadata(osd, meta, nullptr);
2145 auto p = meta.find("ceph_version_short");
2146 if (p == meta.end()) continue;
2147 versions[p->second].push_back(string("osd.") + stringify(osd));
2148 }
2149 }
2150 }
2151
2152 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2153 {
2154 map<string, string> metadata;
2155 int r = load_metadata(osd, metadata, nullptr);
2156 if (r < 0)
2157 return r;
2158
2159 auto it = metadata.find("osd_objectstore");
2160 if (it == metadata.end())
2161 return -ENOENT;
2162 *type = it->second;
2163 return 0;
2164 }
2165
2166 void OSDMonitor::get_filestore_osd_list()
2167 {
2168 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2169 string objectstore_type;
2170 int r = get_osd_objectstore_type(osd, &objectstore_type);
2171 if (r == 0 && objectstore_type == "filestore") {
2172 filestore_osds.insert(osd);
2173 }
2174 }
2175 }
2176
2177 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2178 {
2179 if (g_conf()->mon_warn_on_filestore_osds &&
2180 filestore_osds.size() > 0) {
2181 ostringstream ss, deprecated_tip;
2182 list<string> detail;
2183 ss << filestore_osds.size()
2184 << " osd(s) "
2185 << (filestore_osds.size() == 1 ? "is" : "are")
2186 << " running Filestore";
2187 deprecated_tip << ss.str();
2188 ss << " [Deprecated]";
2189 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2190 filestore_osds.size());
2191 deprecated_tip << ", which has been deprecated and"
2192 << " not been optimized for QoS"
2193 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2194 detail.push_back(deprecated_tip.str());
2195 d.detail.swap(detail);
2196 }
2197 }
2198
2199 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2200 const pg_pool_t &pool,
2201 ostream *err)
2202 {
2203 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2204 // since filestore osds could always join the pool later
2205 set<int> checked_osds;
2206 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2207 vector<int> up, acting;
2208 pg_t pgid(ps, pool_id);
2209 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2210 for (int osd : up) {
2211 if (checked_osds.find(osd) != checked_osds.end())
2212 continue;
2213 string objectstore_type;
2214 int r = get_osd_objectstore_type(osd, &objectstore_type);
2215 // allow with missing metadata, e.g. due to an osd never booting yet
2216 if (r < 0 || objectstore_type == "bluestore") {
2217 checked_osds.insert(osd);
2218 continue;
2219 }
2220 *err << "osd." << osd << " uses " << objectstore_type;
2221 return false;
2222 }
2223 }
2224 return true;
2225 }
2226
2227 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2228 {
2229 map<string,string> m;
2230 if (int r = load_metadata(osd, m, err))
2231 return r;
2232 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2233 f->dump_string(p->first.c_str(), p->second);
2234 return 0;
2235 }
2236
2237 void OSDMonitor::print_nodes(Formatter *f)
2238 {
2239 // group OSDs by their hosts
2240 map<string, list<int> > osds; // hostname => osd
2241 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2242 map<string, string> m;
2243 if (load_metadata(osd, m, NULL)) {
2244 continue;
2245 }
2246 map<string, string>::iterator hostname = m.find("hostname");
2247 if (hostname == m.end()) {
2248 // not likely though
2249 continue;
2250 }
2251 osds[hostname->second].push_back(osd);
2252 }
2253
2254 dump_services(f, osds, "osd");
2255 }
2256
void OSDMonitor::share_map_with_random_osd()
{
  // Push the latest committed map change to one randomly-chosen up OSD
  // so it can propagate from there; a no-op if no OSD is up or none has
  // a session with us.
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  // (send only the single newest incremental: epoch-1 -> epoch)
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2283
version_t OSDMonitor::get_trim_to() const
{
  // Return the highest osdmap version that may be trimmed, or 0 when
  // trimming must be deferred: no quorum, PGs still being created, or
  // trimming blocked for debugging.
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      // pending pg creations may still need older maps
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
            << " blocking osdmap trim"
            << " ('mon_debug_block_osdmap_trim' set to 'true')"
            << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // start from the oldest epoch any osd/pg may still reference
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override of the trim floor (debug/repair aid)
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
	       << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs committed maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim target if it actually advances first_committed
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2331
2332 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2333 {
2334 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2335 // also scan osd epochs
2336 // don't trim past the oldest reported osd epoch
2337 for (auto [osd, epoch] : osd_epochs) {
2338 if (epoch < floor) {
2339 floor = epoch;
2340 }
2341 }
2342 return floor;
2343 }
2344
2345 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2346 version_t first)
2347 {
2348 dout(10) << __func__ << " including full map for e " << first << dendl;
2349 bufferlist bl;
2350 get_version_full(first, bl);
2351 put_version_full(tx, first, bl);
2352
2353 if (has_osdmap_manifest &&
2354 first > osdmap_manifest.get_first_pinned()) {
2355 _prune_update_trimmed(tx, first);
2356 }
2357 }
2358
2359
2360 /* full osdmap prune
2361 *
2362 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2363 */
2364
2365 void OSDMonitor::load_osdmap_manifest()
2366 {
2367 bool store_has_manifest =
2368 mon.store->exists(get_service_name(), "osdmap_manifest");
2369
2370 if (!store_has_manifest) {
2371 if (!has_osdmap_manifest) {
2372 return;
2373 }
2374
2375 dout(20) << __func__
2376 << " dropping osdmap manifest from memory." << dendl;
2377 osdmap_manifest = osdmap_manifest_t();
2378 has_osdmap_manifest = false;
2379 return;
2380 }
2381
2382 dout(20) << __func__
2383 << " osdmap manifest detected in store; reload." << dendl;
2384
2385 bufferlist manifest_bl;
2386 int r = get_value("osdmap_manifest", manifest_bl);
2387 if (r < 0) {
2388 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2389 ceph_abort_msg("error reading manifest");
2390 }
2391 osdmap_manifest.decode(manifest_bl);
2392 has_osdmap_manifest = true;
2393
2394 dout(10) << __func__ << " store osdmap manifest pinned ("
2395 << osdmap_manifest.get_first_pinned()
2396 << " .. "
2397 << osdmap_manifest.get_last_pinned()
2398 << ")"
2399 << dendl;
2400 }
2401
bool OSDMonitor::should_prune() const
{
  // Decide whether a full-osdmap prune pass is warranted, based on the
  // committed epoch span, the prune tunables, and how far a previous
  // prune (if any) already got.
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // newest epoch we would ever pin; the newest min_osdmap_epochs maps
  // are always kept intact
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous prune already covered everything up to last_to_pin
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2461
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  // A trim moved first_committed up to `first`; drop all pinned
  // versions below it so the manifest's first pinned map stays equal
  // to the store's first committed full map.  Only the on-disk copy is
  // updated here (via tx); the in-memory manifest is reloaded later.
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  osdmap_manifest_t manifest = osdmap_manifest;

  // the new floor must itself be pinned (encode_trim_extra re-stored
  // the full map for it)
  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pinned version strictly below `first`
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2495
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  // Seed `manifest` with the first version this prune pass should pin:
  // the first committed epoch if we have never pruned (or prior prune
  // state was fully trimmed away), otherwise the last version pinned
  // by a previous pass.
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2530
2531 bool OSDMonitor::_prune_sanitize_options() const
2532 {
2533 uint64_t prune_interval =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2535 uint64_t prune_min =
2536 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2537 uint64_t txsize =
2538 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2539
2540 bool r = true;
2541
2542 if (prune_interval == 0) {
2543 derr << __func__
2544 << " prune is enabled BUT prune interval is zero; abort."
2545 << dendl;
2546 r = false;
2547 } else if (prune_interval == 1) {
2548 derr << __func__
2549 << " prune interval is equal to one, which essentially means"
2550 " no pruning; abort."
2551 << dendl;
2552 r = false;
2553 }
2554 if (prune_min == 0) {
2555 derr << __func__
2556 << " prune is enabled BUT prune min is zero; abort."
2557 << dendl;
2558 r = false;
2559 }
2560 if (prune_interval > prune_min) {
2561 derr << __func__
2562 << " impossible to ascertain proper prune interval because"
2563 << " it is greater than the minimum prune epochs"
2564 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2565 << dendl;
2566 r = false;
2567 }
2568
2569 if (txsize < prune_interval - 1) {
2570 derr << __func__
2571 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2572 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2573 << "); abort." << dendl;
2574 r = false;
2575 }
2576 return r;
2577 }
2578
// Runtime on/off switch for the full-osdmap prune feature.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2582
// Pruning can only be used when the monitors require the
// OSDMAP_PRUNE feature (i.e., all quorum members understand the
// osdmap manifest).
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
      ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2587
2588 /** do_prune
2589 *
2590 * @returns true if has side-effects; false otherwise.
2591 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  // Perform one bounded pass of full-osdmap pruning: pin every
  // `prune_interval`-th full map and erase the full maps in between,
  // removing at most `txsize` versions, then persist the updated
  // manifest in `tx`.  Returns true iff the transaction was modified.
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // stop once another full interval would pass the pinning ceiling
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // both interval endpoints must still have their full maps on disk
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase the full maps strictly between the two pinned endpoints
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval fits
  ceph_assert(num_pruned > 0);

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2716
2717
2718 // -------------
2719
2720 bool OSDMonitor::preprocess_query(MonOpRequestRef op)
2721 {
2722 op->mark_osdmon_event(__func__);
2723 Message *m = op->get_req();
2724 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
2725
2726 switch (m->get_type()) {
2727 // READs
2728 case MSG_MON_COMMAND:
2729 try {
2730 return preprocess_command(op);
2731 } catch (const bad_cmd_get& e) {
2732 bufferlist bl;
2733 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2734 return true;
2735 }
2736 case CEPH_MSG_MON_GET_OSDMAP:
2737 return preprocess_get_osdmap(op);
2738
2739 // damp updates
2740 case MSG_OSD_MARK_ME_DOWN:
2741 return preprocess_mark_me_down(op);
2742 case MSG_OSD_MARK_ME_DEAD:
2743 return preprocess_mark_me_dead(op);
2744 case MSG_OSD_FULL:
2745 return preprocess_full(op);
2746 case MSG_OSD_FAILURE:
2747 return preprocess_failure(op);
2748 case MSG_OSD_BOOT:
2749 return preprocess_boot(op);
2750 case MSG_OSD_ALIVE:
2751 return preprocess_alive(op);
2752 case MSG_OSD_PG_CREATED:
2753 return preprocess_pg_created(op);
2754 case MSG_OSD_PG_READY_TO_MERGE:
2755 return preprocess_pg_ready_to_merge(op);
2756 case MSG_OSD_PGTEMP:
2757 return preprocess_pgtemp(op);
2758 case MSG_OSD_BEACON:
2759 return preprocess_beacon(op);
2760
2761 case CEPH_MSG_POOLOP:
2762 return preprocess_pool_op(op);
2763
2764 case MSG_REMOVE_SNAPS:
2765 return preprocess_remove_snaps(op);
2766
2767 case MSG_MON_GET_PURGED_SNAPS:
2768 return preprocess_get_purged_snaps(op);
2769
2770 default:
2771 ceph_abort();
2772 return false;
2773 }
2774 }
2775
2776 bool OSDMonitor::prepare_update(MonOpRequestRef op)
2777 {
2778 op->mark_osdmon_event(__func__);
2779 Message *m = op->get_req();
2780 dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;
2781
2782 switch (m->get_type()) {
2783 // damp updates
2784 case MSG_OSD_MARK_ME_DOWN:
2785 return prepare_mark_me_down(op);
2786 case MSG_OSD_MARK_ME_DEAD:
2787 return prepare_mark_me_dead(op);
2788 case MSG_OSD_FULL:
2789 return prepare_full(op);
2790 case MSG_OSD_FAILURE:
2791 return prepare_failure(op);
2792 case MSG_OSD_BOOT:
2793 return prepare_boot(op);
2794 case MSG_OSD_ALIVE:
2795 return prepare_alive(op);
2796 case MSG_OSD_PG_CREATED:
2797 return prepare_pg_created(op);
2798 case MSG_OSD_PGTEMP:
2799 return prepare_pgtemp(op);
2800 case MSG_OSD_PG_READY_TO_MERGE:
2801 return prepare_pg_ready_to_merge(op);
2802 case MSG_OSD_BEACON:
2803 return prepare_beacon(op);
2804
2805 case MSG_MON_COMMAND:
2806 try {
2807 return prepare_command(op);
2808 } catch (const bad_cmd_get& e) {
2809 bufferlist bl;
2810 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
2811 return false; /* nothing to propose */
2812 }
2813
2814 case CEPH_MSG_POOLOP:
2815 return prepare_pool_op(op);
2816
2817 case MSG_REMOVE_SNAPS:
2818 return prepare_remove_snaps(op);
2819
2820
2821 default:
2822 ceph_abort();
2823 }
2824
2825 return false;
2826 }
2827
2828 bool OSDMonitor::should_propose(double& delay)
2829 {
2830 dout(10) << "should_propose" << dendl;
2831
2832 // if full map, propose immediately! any subsequent changes will be clobbered.
2833 if (pending_inc.fullmap.length())
2834 return true;
2835
2836 // adjust osd weights?
2837 if (!osd_weight.empty() &&
2838 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2839 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2840 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2841 delay = 0.0;
2842 osd_weight.clear();
2843 return true;
2844 }
2845
2846 return PaxosService::should_propose(delay);
2847 }
2848
2849
2850
2851 // ---------------------------
2852 // READs
2853
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  // Answer an explicit MMonGetOSDMap request with a batch of full
  // and/or incremental maps, bounded by both a map-count budget and a
  // byte budget.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the peer's connection features if known, else fall
  // back to the quorum's
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps: requested range clamped to what we still hold, stopping
  // when either budget runs out
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // incremental maps: the same (shared) budgets continue to apply
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer what range we can serve so it can re-request the rest
  reply->cluster_osdmap_trim_lower_bound = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2890
2891
2892 // ---------------------------
2893 // UPDATEs
2894
2895 // failure --
2896
2897 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2898 // check permissions
2899 MonSession *session = op->get_session();
2900 if (!session)
2901 return true;
2902 if (!session->is_capable("osd", MON_CAP_X)) {
2903 dout(0) << "got MOSDFailure from entity with insufficient caps "
2904 << session->caps << dendl;
2905 return true;
2906 }
2907 if (fsid != mon.monmap->fsid) {
2908 dout(0) << "check_source: on fsid " << fsid
2909 << " != " << mon.monmap->fsid << dendl;
2910 return true;
2911 }
2912 return false;
2913 }
2914
2915
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  // Filter an incoming MOSDFailure report.  Returns true when the
  // message is fully handled here (dropped, possibly after sending the
  // reporter newer maps); returns false to let prepare_failure()
  // actually record the failure report.
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is gone or stale itself; teach it the newer map
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to a different (older) incarnation of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down(badboy) half of this condition looks
  // redundant — a down target already took the "weird?" branch above;
  // confirm before simplifying.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    // marking this osd down is currently inhibited (see can_mark_down)
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // a genuinely new, actionable report: fall through to prepare_failure
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2987
// Completion context for MOSDMarkMeDown: once the op resolves, echo the
// request back to the requester as the acknowledgment it is waiting for.
class C_AckMarkedDown : public C_MonOp {
  OSDMonitor *osdmon;  // owning monitor, used to send the reply / redispatch
public:
  C_AckMarkedDown(
    OSDMonitor *osdmon,
    MonOpRequestRef op)
    : C_MonOp(op), osdmon(osdmon) {}

  void _finish(int r) override {
    if (r == 0) {
      // success: ack by echoing the original request's identity back
      auto m = op->get_req<MOSDMarkMeDown>();
      osdmon->mon.send_reply(
	op,
	new MOSDMarkMeDown(
	  m->fsid,
	  m->target_osd,
	  m->target_addrs,
	  m->get_epoch(),
	  false));   // ACK itself does not request an ack
    } else if (r == -EAGAIN) {
      // -EAGAIN: redispatch the op so it can be retried
      osdmon->dispatch(op);
    } else {
      // any other result is a programming error
      ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
    }
  }
  ~C_AckMarkedDown() override {
  }
};
3016
/**
 * Read-only screening of an MOSDMarkMeDown request, in which an osd
 * asks to be marked down (e.g. on clean shutdown).
 *
 * Returns true if the request was fully handled here (invalid source,
 * requester not alive/matching in the current map, or nodown policy in
 * effect); returns false to let prepare_mark_me_down() queue the state
 * change.  When we bail out and the requester asked for an ack, we ack
 * immediately so it is not left waiting.
 */
bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int from = m->target_osd;

  // check permissions
  if (check_source(op, m->fsid))
    goto reply;

  // first, verify the reporting host is valid
  if (!m->get_orig_source().is_osd())
    goto reply;

  // the requester must exist, be up, and match the map's addrs;
  // otherwise it is a stale instance — just help it catch up.
  if (!osdmap.exists(from) ||
      osdmap.is_down(from) ||
      osdmap.get_addrs(from) != m->target_addrs) {
    dout(5) << "preprocess_mark_me_down from dead osd."
	    << from << ", ignoring" << dendl;
    send_incremental(op, m->get_epoch()+1);
    goto reply;
  }

  // no down might be set
  if (!can_mark_down(from))
    goto reply;

  dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
	   << " " << m->target_addrs << dendl;
  return false;

 reply:
  // complete the ack context immediately (r=0) so the requester
  // does not block forever waiting on a proposal that won't happen
  if (m->request_ack) {
    Context *c(new C_AckMarkedDown(this, op));
    c->complete(0);
  }
  return true;
}
3055
/**
 * Queue the down (and optionally dead) state change an osd requested
 * for itself via MOSDMarkMeDown, and ack after the proposal commits.
 */
bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDown>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_down() already vetted these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);

  mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
  // new_state entries are xor deltas; setting the UP bit flips the osd down
  pending_inc.new_state[target_osd] = CEPH_OSD_UP;
  if (m->down_and_dead) {
    // record the dead epoch in pending xinfo, seeding from the current map
    if (!pending_inc.new_xinfo.count(target_osd)) {
      pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
    }
    pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  }
  // only ack once the proposal is committed
  if (m->request_ack)
    wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
  return true;
}
3077
3078 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
3079 {
3080 op->mark_osdmon_event(__func__);
3081 auto m = op->get_req<MOSDMarkMeDead>();
3082 int from = m->target_osd;
3083
3084 // check permissions
3085 if (check_source(op, m->fsid)) {
3086 mon.no_reply(op);
3087 return true;
3088 }
3089
3090 // first, verify the reporting host is valid
3091 if (!m->get_orig_source().is_osd()) {
3092 mon.no_reply(op);
3093 return true;
3094 }
3095
3096 if (!osdmap.exists(from) ||
3097 !osdmap.is_down(from)) {
3098 dout(5) << __func__ << " from nonexistent or up osd." << from
3099 << ", ignoring" << dendl;
3100 send_incremental(op, m->get_epoch()+1);
3101 mon.no_reply(op);
3102 return true;
3103 }
3104
3105 return false;
3106 }
3107
/**
 * Record the dead_epoch an osd declared for itself via MOSDMarkMeDead.
 * On a committed proposal there is nothing to send back, so the op is
 * simply dropped.
 */
bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDMarkMeDead>();
  int target_osd = m->target_osd;

  // preprocess_mark_me_dead() guarantees this
  ceph_assert(osdmap.is_down(target_osd));

  mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
		   << m->get_epoch();
  // record the dead epoch in pending xinfo, seeding from the current map
  if (!pending_inc.new_xinfo.count(target_osd)) {
    pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
  }
  pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
  // defer until the proposal commits, then drop the op (no reply needed)
  wait_for_finished_proposal(
    op,
    new LambdaContext(
      [op, this] (int r) {
	if (r >= 0) {
	  mon.no_reply(op);	  // ignore on success
	}
      }
      ));
  return true;
}
3133
3134 bool OSDMonitor::can_mark_down(int i)
3135 {
3136 if (osdmap.is_nodown(i)) {
3137 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3138 << "will not mark it down" << dendl;
3139 return false;
3140 }
3141
3142 int num_osds = osdmap.get_num_osds();
3143 if (num_osds == 0) {
3144 dout(5) << __func__ << " no osds" << dendl;
3145 return false;
3146 }
3147 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3148 float up_ratio = (float)up / (float)num_osds;
3149 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3150 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3151 << g_conf()->mon_osd_min_up_ratio
3152 << ", will not mark osd." << i << " down" << dendl;
3153 return false;
3154 }
3155 return true;
3156 }
3157
3158 bool OSDMonitor::can_mark_up(int i)
3159 {
3160 if (osdmap.is_noup(i)) {
3161 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3162 << "will not mark it up" << dendl;
3163 return false;
3164 }
3165
3166 return true;
3167 }
3168
3169 /**
3170 * @note the parameter @p i apparently only exists here so we can output the
3171 * osd's id on messages.
3172 */
3173 bool OSDMonitor::can_mark_out(int i)
3174 {
3175 if (osdmap.is_noout(i)) {
3176 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3177 << "will not mark it out" << dendl;
3178 return false;
3179 }
3180
3181 int num_osds = osdmap.get_num_osds();
3182 if (num_osds == 0) {
3183 dout(5) << __func__ << " no osds" << dendl;
3184 return false;
3185 }
3186 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3187 float in_ratio = (float)in / (float)num_osds;
3188 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3189 if (i >= 0)
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osd." << i << " out" << dendl;
3193 else
3194 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3195 << g_conf()->mon_osd_min_in_ratio
3196 << ", will not mark osds out" << dendl;
3197 return false;
3198 }
3199
3200 return true;
3201 }
3202
3203 bool OSDMonitor::can_mark_in(int i)
3204 {
3205 if (osdmap.is_noin(i)) {
3206 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3207 << "will not mark it in" << dendl;
3208 return false;
3209 }
3210
3211 return true;
3212 }
3213
/**
 * Scan all tracked failure reports and decide which osds can now be
 * marked down.
 *
 * For each tracked osd, try check_failure() (which may queue a down
 * state change in pending_inc); entries that saw no action and have
 * gone stale are dropped.  Note the ordering: a successful
 * check_failure() keeps its entry even if the entry is also stale.
 *
 * @param now current time, used for grace/staleness computations
 * @return true if at least one failure was acted upon (so the caller
 *         should propose a map change)
 */
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      // erase returns the next valid iterator
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3235
/**
 * Compute the heartbeat grace period to apply before marking
 * target_osd down.
 *
 * Starts from osd_heartbeat_grace; when mon_osd_adjust_heartbeat_grace
 * is set, stretches it by exponentially-decayed estimates of how
 * 'laggy' both the target and its reporters have historically been, so
 * that a historically slow (but alive) subcluster is given more slack.
 *
 * @pre fi.reporters should be non-empty — we divide by its size below
 *      (callers reach this via check_failure(), which asserts that).
 */
utime_t OSDMonitor::get_grace_time(utime_t now,
				   int target_osd,
				   failure_info_t& fi) const
{
  utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
  if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
    return orig_grace;
  }
  utime_t grace = orig_grace;
  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
  // decay_k is negative, so exp(t * decay_k) halves every `halflife` secs
  double decay_k = ::log(.5) / halflife;

  // scale grace period based on historical probability of 'lagginess'
  // (false positive failures due to slowness).
  const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
  const utime_t failed_for = now - fi.get_failed_since();
  double decay = exp((double)failed_for * decay_k);
  dout(20) << " halflife " << halflife << " decay_k " << decay_k
	   << " failed_for " << failed_for << " decay " << decay << dendl;
  double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
  grace += my_grace;

  // consider the peers reporting a failure a proxy for a potential
  // 'subcluster' over the overall cluster that is similarly
  // laggy. this is clearly not true in all cases, but will sometimes
  // help us localize the grace correction to a subset of the system
  // (say, a rack with a bad switch) that is unhappy.
  double peer_grace = 0;
  for (auto& [reporter, report] : fi.reporters) {
    if (osdmap.exists(reporter)) {
      const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
      utime_t elapsed = now - xi.down_stamp;
      double decay = exp((double)elapsed * decay_k);
      peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
    }
  }
  // average the peer contribution across all reporters
  peer_grace /= (double)fi.reporters.size();
  grace += peer_grace;
  dout(10) << " osd." << target_osd << " has "
	   << fi.reporters.size() << " reporters, "
	   << grace << " grace (" << orig_grace << " + " << my_grace
	   << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
	   << dendl;

  return grace;
}
3282
3283 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3284 {
3285 // already pending failure?
3286 if (pending_inc.new_state.count(target_osd) &&
3287 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3288 dout(10) << " already pending failure" << dendl;
3289 return true;
3290 }
3291
3292 set<string> reporters_by_subtree;
3293 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3294 ceph_assert(fi.reporters.size());
3295 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3296 // get the parent bucket whose type matches with "reporter_subtree_level".
3297 // fall back to OSD if the level doesn't exist.
3298 if (osdmap.exists(p->first)) {
3299 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3300 if (auto iter = reporter_loc.find(reporter_subtree_level);
3301 iter == reporter_loc.end()) {
3302 reporters_by_subtree.insert("osd." + to_string(p->first));
3303 } else {
3304 reporters_by_subtree.insert(iter->second);
3305 }
3306 ++p;
3307 } else {
3308 fi.cancel_report(p->first);;
3309 p = fi.reporters.erase(p);
3310 }
3311 }
3312 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3313 return false;
3314 }
3315 const utime_t failed_for = now - fi.get_failed_since();
3316 const utime_t grace = get_grace_time(now, target_osd, fi);
3317 if (failed_for >= grace) {
3318 dout(1) << " we have enough reporters to mark osd." << target_osd
3319 << " down" << dendl;
3320 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3321
3322 mon.clog->info() << "osd." << target_osd << " failed ("
3323 << osdmap.crush->get_full_location_ordered_string(
3324 target_osd)
3325 << ") ("
3326 << (int)reporters_by_subtree.size()
3327 << " reporters from different "
3328 << reporter_subtree_level << " after "
3329 << failed_for << " >= grace " << grace << ")";
3330 return true;
3331 }
3332 return false;
3333 }
3334
// Has this failure record outlived its usefulness?
bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
{
  // if it takes too long to either cancel the report or to mark the osd
  // down, some reporters must have failed to cancel their reports. let's
  // just forget these reports.
  const utime_t failed_for = now - fi.get_failed_since();
  auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
  auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
  return failed_for >= (heartbeat_grace + heartbeat_stale);
}
3345
3346 void OSDMonitor::force_failure(int target_osd, int by)
3347 {
3348 // already pending failure?
3349 if (pending_inc.new_state.count(target_osd) &&
3350 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3351 dout(10) << " already pending failure" << dendl;
3352 return;
3353 }
3354
3355 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3356 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3357 if (!pending_inc.new_xinfo.count(target_osd)) {
3358 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3359 }
3360 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3361
3362 mon.clog->info() << "osd." << target_osd << " failed ("
3363 << osdmap.crush->get_full_location_ordered_string(target_osd)
3364 << ") (connection refused reported by osd." << by << ")";
3365 return;
3366 }
3367
/**
 * Apply an MOSDFailure report to the pending state.
 *
 * A "failed" report either forces an immediate failure (when the
 * reporter saw a connection refused) or records the report and
 * re-checks whether enough distinct reporters have accumulated to mark
 * the target down.  A "not failed" report cancels that reporter's
 * earlier report.
 *
 * @return true if pending_inc was changed and a proposal is needed
 */
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already vetted the target
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      mon.clog->debug() << "osd." << m->get_target_osd()
			<< " reported immediately failed by "
			<< m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    // track the report; propose only if this pushed us over the threshold
    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
		      << " failure report canceled by "
		      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3427
3428 void OSDMonitor::process_failures()
3429 {
3430 map<int,failure_info_t>::iterator p = failure_info.begin();
3431 while (p != failure_info.end()) {
3432 if (osdmap.is_up(p->first)) {
3433 ++p;
3434 } else {
3435 dout(10) << "process_failures osd." << p->first << dendl;
3436 list<MonOpRequestRef> ls;
3437 p->second.take_report_messages(ls);
3438 failure_info.erase(p++);
3439
3440 while (!ls.empty()) {
3441 MonOpRequestRef o = ls.front();
3442 if (o) {
3443 o->mark_event(__func__);
3444 MOSDFailure *m = o->get_req<MOSDFailure>();
3445 send_latest(o, m->get_epoch());
3446 mon.no_reply(o);
3447 }
3448 ls.pop_front();
3449 }
3450 }
3451 }
3452 }
3453
3454 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3455 {
3456 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3457
3458 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3459 p != failure_info.end();
3460 ++p) {
3461 p->second.take_report_messages(ls);
3462 }
3463 failure_info.clear();
3464 }
3465
3466 int OSDMonitor::get_grace_interval_threshold()
3467 {
3468 int halflife = g_conf()->mon_osd_laggy_halflife;
3469 // Scale the halflife period (default: 1_hr) by
3470 // a factor (48) to calculate the threshold.
3471 int grace_threshold_factor = 48;
3472 return halflife * grace_threshold_factor;
3473 }
3474
3475 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3476 {
3477 int grace_interval_threshold_secs = get_grace_interval_threshold();
3478 if (last_failed_interval > grace_interval_threshold_secs) {
3479 dout(1) << " last_failed_interval " << last_failed_interval
3480 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3481 << dendl;
3482 return true;
3483 }
3484 return false;
3485 }
3486
3487 void OSDMonitor::set_default_laggy_params(int target_osd)
3488 {
3489 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3490 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3491 }
3492 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3493 xi.down_stamp = pending_inc.modified;
3494 xi.laggy_probability = 0.0;
3495 xi.laggy_interval = 0;
3496 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3497 }
3498
3499
3500 // boot --
3501
/**
 * Read-only screening of an MOSDBoot message.
 *
 * Drops boots from unauthorized senders, wrong clusters, blank
 * addresses, osds whose feature bits are too old (or too new for the
 * map's require_osd_release), and crimson osds when the allow_crimson
 * flag is not set.  A duplicate boot from the instance that is already
 * up is acked directly; stale instances are just sent maps.  Returns
 * false to let prepare_boot() queue the actual map change.
 */
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_PACIFIC)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because the osd lacks CEPH_FEATURE_SERVER_PACIFIC";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < octopus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_REEF) &&
      osdmap.require_osd_release < ceph_release_t::pacific) {
    mon.clog->info() << "disallowing boot of reef+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < pacific";
    goto ignore;
  }

  // See crimson/osd/osd.cc: OSD::_send_boot
  if (auto type_iter = m->metadata.find("osd_type");
      type_iter != m->metadata.end()) {
    const auto &otype = type_iter->second;
    // m->metadata["osd_type"] must be "crimson", classic doesn't send osd_type
    if (otype == "crimson") {
      if (!osdmap.get_allow_crimson()) {
	mon.clog->info()
	  << "Disallowing boot of crimson-osd without allow_crimson "
	  << "OSDMap flag. Run ceph osd set_allow_crimson to set "
	  << "allow_crimson flag. Note that crimson-osd is "
	  << "considered unstable and may result in crashes or "
	  << "data loss. Its usage should be restricted to "
	  << "testing and development.";
	goto ignore;
      }
    } else {
      derr << __func__ << ": osd " << m->get_orig_source_inst()
	   << " sent non-crimson osd_type field in MOSDBoot: "
	   << otype
	   << " -- booting anyway"
	   << dendl;
    }
  }

  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // a different uuid for the same id means a different physical osd
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message from before the osd's last up_from, from the same
  // instance, is stale; just send it the maps it is missing
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3632
/**
 * Queue the map changes that bring a booting osd up.
 *
 * If the osd is still up in the map (a reboot raced with its old
 * instance), first queue a down flip and retry the boot after the
 * proposal commits.  Otherwise record the new instance's addresses,
 * weight, uuid, metadata, clean-interval and laggy statistics in
 * pending_inc, and ack the boot (via C_Booted) once committed.
 *
 * @return true when the op was consumed (reply deferred to a committed
 *         proposal); false only if the osd id exceeds max_osd.
 */
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // pending new_state values are xor deltas; apply them to compute the
  // effective (pending) state of this osd
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up?  mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these;  if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this;  if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata: stage the osd's key/value metadata for storage
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update laggy statistics: a boot_epoch of 0 means a clean (non-laggy)
    // boot, which decays the estimates; otherwise fold the observed down
    // interval into the exponentially-weighted laggy estimates
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval =  g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight the osd had before it was auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3787
3788 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3789 {
3790 op->mark_osdmon_event(__func__);
3791 auto m = op->get_req<MOSDBoot>();
3792 dout(7) << "_booted " << m->get_orig_source_inst()
3793 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3794
3795 if (logit) {
3796 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3797 << " boot";
3798 }
3799
3800 send_latest(op, m->sb.current_epoch+1);
3801 }
3802
3803
3804 // -------------
3805 // full
3806
/**
 * Read-only screening of an MOSDFull message, in which an osd reports
 * its fullness state (NEARFULL/BACKFILLFULL/FULL bits).
 *
 * Drops messages from unauthorized or stale osd instances, and replies
 * directly (with a map) when the requested bits already match the
 * committed map.  Returns false to let prepare_full() queue the change.
 */
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // drop if the sender is the instance that went down (down, and its
  // addrs match the most recent ones) or a stale instance (up, but its
  // addrs do not match the map's current ones)
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change if the fullness bits already match
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3857
/**
 * Queue the fullness-state bits an osd reported via MOSDFull, replying
 * with a map once the proposal commits.
 */
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  // only the fullness-related bits may be changed by this message
  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state xor any pending flips
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    // new_state entries hold xor deltas: first clear any pending
    // fullness flips, then set the delta between the committed state
    // and the wanted state
    if (p != pending_inc.new_state.end()) {
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3895
3896 // -------------
3897 // alive
3898
/**
 * Read-only screening of an MOSDAlive message, in which an osd asks for
 * its up_thru to be advanced.
 *
 * Drops unauthorized or stale senders; when up_thru is already at or
 * past the wanted epoch, just replies with the map.  Returns false to
 * let prepare_alive() queue the up_thru update.
 */
bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDAlive>();
  int from = m->get_orig_source().num();

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // the sender must be up and match the map's addrs
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "preprocess_alive ignoring alive message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  if (osdmap.get_up_thru(from) >= m->want) {
    // yup.
    dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    return true;
  }

  dout(10) << "preprocess_alive want up_thru " << m->want
	   << " from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3937
3938 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3939 {
3940 op->mark_osdmon_event(__func__);
3941 auto m = op->get_req<MOSDAlive>();
3942 int from = m->get_orig_source().num();
3943
3944 if (0) { // we probably don't care much about these
3945 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3946 }
3947
3948 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3949 << " from " << m->get_orig_source_inst() << dendl;
3950
3951 update_up_thru(from, m->version); // set to the latest map the OSD has
3952 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3953 return true;
3954 }
3955
// Reply to an osd request by sending it every map from epoch e onward
// (send_latest() sends the full map when e == 0).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3964
3965 // pg_created
3966 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3967 {
3968 op->mark_osdmon_event(__func__);
3969 auto m = op->get_req<MOSDPGCreated>();
3970 dout(10) << __func__ << " " << *m << dendl;
3971 auto session = op->get_session();
3972 mon.no_reply(op);
3973 if (!session) {
3974 dout(10) << __func__ << ": no monitor session!" << dendl;
3975 return true;
3976 }
3977 if (!session->is_capable("osd", MON_CAP_X)) {
3978 derr << __func__ << " received from entity "
3979 << "with insufficient privileges " << session->caps << dendl;
3980 return true;
3981 }
3982 // always forward the "created!" to the leader
3983 return false;
3984 }
3985
3986 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3987 {
3988 op->mark_osdmon_event(__func__);
3989 auto m = op->get_req<MOSDPGCreated>();
3990 dout(10) << __func__ << " " << *m << dendl;
3991 auto src = m->get_orig_source();
3992 auto from = src.num();
3993 if (!src.is_osd() ||
3994 !mon.osdmon()->osdmap.is_up(from) ||
3995 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3996 m->get_orig_source_addrs())) {
3997 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3998 return false;
3999 }
4000 pending_created_pgs.push_back(m->pgid);
4001 return true;
4002 }
4003
// Pre-screen an osd's "pg ready to merge" notification. Returns true when
// the message can be dropped without a proposal; false forwards it to
// prepare_pg_ready_to_merge() on the leader.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // the merge source must be the pool's last pg: pg_num == ps + 1.
  // pg_num already at or below ps means the merge has happened.
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // pg_num_pending must already point at (or below) the source pg
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
4043
// Apply (or back off) a pending pg merge for the pool named in the message.
// On success the pool's pg_num is decremented in the pending incremental;
// if the osd reported not-ready, the merge attempt is cancelled by resetting
// pg_num_pending back to pg_num.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // start from the pending pool definition if one is queued, else committed
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // re-validate against the (possibly updated) pending pool; a concurrent
  // pg_num change invalidates this message, so retry it after the commit
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return false; /* nothing to propose, yet */
  }

  if (m->ready) {
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand that pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // optional fault injection: occasionally bounce pg_num back up to make
  // merge races exercisable in testing
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4101
4102
4103 // -------------
4104 // pg_temp changes
4105
// Pre-screen an MOSDPGTemp message. Returns true when every requested
// pg_temp mapping is either already in effect or must be ignored (so no
// proposal is needed); false forwards the message to prepare_pgtemp().
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;   // pgs we decided to drop (stale pool/primary)

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // sender must still be the up osd we know at this address
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced request always goes through to prepare_pgtemp()
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    // an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // everything requested is already in effect; just send the map back
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4199
4200 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4201 {
4202 epoch_t old_up_thru = osdmap.get_up_thru(from);
4203 auto ut = pending_inc.new_up_thru.find(from);
4204 if (ut != pending_inc.new_up_thru.end()) {
4205 old_up_thru = ut->second;
4206 }
4207 if (up_thru > old_up_thru) {
4208 // set up_thru too, so the osd doesn't have to ask again
4209 pending_inc.new_up_thru[from] = up_thru;
4210 }
4211 }
4212
4213 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4214 {
4215 op->mark_osdmon_event(__func__);
4216 auto m = op->get_req<MOSDPGTemp>();
4217 int from = m->get_orig_source().num();
4218 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4219 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4220 uint64_t pool = p->first.pool();
4221 if (pending_inc.old_pools.count(pool)) {
4222 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4223 << ": pool pending removal" << dendl;
4224 continue;
4225 }
4226 if (!osdmap.have_pg_pool(pool)) {
4227 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4228 << ": pool has been removed" << dendl;
4229 continue;
4230 }
4231 pending_inc.new_pg_temp[p->first] =
4232 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4233
4234 // unconditionally clear pg_primary (until this message can encode
4235 // a change for that, too.. at which point we need to also fix
4236 // preprocess_pg_temp)
4237 if (osdmap.primary_temp->count(p->first) ||
4238 pending_inc.new_primary_temp.count(p->first))
4239 pending_inc.new_primary_temp[p->first] = -1;
4240 }
4241
4242 // set up_thru too, so the osd doesn't have to ask again
4243 update_up_thru(from, m->map_epoch);
4244
4245 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4246 return true;
4247 }
4248
4249
4250 // ---
4251
// Pre-screen an MRemoveSnaps request (typically from the MDS). Returns true
// when every listed snap is already removed (or the message must be dropped);
// false forwards to prepare_remove_snaps() for a proposal.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  // NOTE(review): no_reply() is called up front; the explicit send_reply()
  // below appears to override it for octopus-capable peers — confirm against
  // Monitor::no_reply semantics before relying on this elsewhere.
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // if any snap is newer than the pool's snap_seq, or not yet recorded as
  // removed, a proposal is needed
  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // everything already removed: octopus-capable senders expect an explicit
  // ack echoing the snap list
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4301
// Queue snap removals into the pending incremental. For each snap not yet
// removed (committed or pending), the pool's pending definition gets the
// snap recorded, its snap_seq advanced if needed, and snap_epoch set.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map, in the pending
      // pool definition, or already queued in new_removed_snaps
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	// pre-octopus maps track removed snaps in the pool itself
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	// snap_seq must cover every removed snap id
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  // octopus-capable senders get an explicit ack after the commit
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4350
// Answer an MMonGetPurgedSnaps request by scanning the purged_epoch_* keys
// in the OSD_SNAP_PREFIX store between m->start and m->last and replying
// with the per-epoch purged-snap intervals.
bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetPurgedSnaps>();
  dout(7) << __func__ << " " << *m << dendl;

  map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;

  string k = make_purged_snap_epoch_key(m->start);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->upper_bound(k);
  unsigned long epoch = m->last;
  while (it->valid()) {
    // stop at the end of the purged_epoch_* key range
    if (it->key().find("purged_epoch_") != 0) {
      break;
    }
    // NOTE: this inner `k` shadows the outer seek key
    string k = it->key();
    int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
    if (n != 1) {
      derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
    } else if (epoch > m->last) {
      break;
    } else {
      bufferlist bl = it->value();
      auto p = bl.cbegin();
      auto &v = r[epoch];
      try {
	ceph::decode(v, p);
      } catch (ceph::buffer::error& e) {
	derr << __func__ << " unable to parse value for key '" << it->key()
	     << "': \n";
	bl.hexdump(*_dout);
	*_dout << dendl;
      }
      // rough size estimate of this record's contribution to the reply
      n += 4 + v.size() * 16;
    }
    // NOTE(review): `n` is re-initialized each iteration, so this ~1MB guard
    // bounds a single record, not the cumulative reply size — confirm intent.
    if (n > 1048576) {
      // impose a semi-arbitrary limit to message size
      break;
    }
    it->next();
  }

  // `epoch` holds the last epoch actually included (or m->last if none)
  auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
  reply->purged_snaps.swap(r);
  mon.send_reply(op, reply.detach());

  return true;
}
4400
4401 // osd beacon
4402 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4403 {
4404 op->mark_osdmon_event(__func__);
4405 // check caps
4406 auto session = op->get_session();
4407 mon.no_reply(op);
4408 if (!session) {
4409 dout(10) << __func__ << " no monitor session!" << dendl;
4410 return true;
4411 }
4412 if (!session->is_capable("osd", MON_CAP_X)) {
4413 derr << __func__ << " received from entity "
4414 << "with insufficient privileges " << session->caps << dendl;
4415 return true;
4416 }
4417 // Always forward the beacon to the leader, even if they are the same as
4418 // the old one. The leader will mark as down osds that haven't sent
4419 // beacon for a few minutes.
4420 return false;
4421 }
4422
// Record an osd beacon on the leader: refresh the osd's liveness bookkeeping
// and last-epoch-clean reports, and propose an xinfo update only when the
// reported last_purged_snaps_scrub moved forward.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  // sender must be an osd that is still up at the same addresses
  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false; /* nothing to propose */
  }

  // liveness bookkeeping: when the beacon arrived and how often to expect it
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  // fold the osd's reported min_last_epoch_clean into per-pg tracking
  for (const auto& pg : beacon->pgs) {
    if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
      unsigned pg_num = pool->get_pg_num();
      last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
    }
  }

  // only an advance of last_purged_snaps_scrub warrants a map change
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false; /* nothing to propose */
  }
}
4467
4468 // ---------------
4469 // map helpers
4470
4471 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4472 {
4473 op->mark_osdmon_event(__func__);
4474 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4475 << " start " << start << dendl;
4476 if (start == 0)
4477 send_full(op);
4478 else
4479 send_incremental(op, start);
4480 }
4481
4482
4483 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4484 {
4485 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4486 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4487 r->cluster_osdmap_trim_lower_bound = get_first_committed();
4488 r->newest_map = osdmap.get_epoch();
4489 return r;
4490 }
4491
// Build an MOSDMap with incrementals for epochs [from..to], encoded for
// `features`. When an incremental is missing from the store, the full map
// for that epoch is included instead. Caller owns the returned message.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->cluster_osdmap_trim_lower_bound = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; `e > 0` guards epoch_t underflow when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental    inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      // no incremental for this epoch; fall back to the full map
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
      //else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental   full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort();  // we should have all maps.
      }
    }
  }
  return m;
}
4525
4526 void OSDMonitor::send_full(MonOpRequestRef op)
4527 {
4528 op->mark_osdmon_event(__func__);
4529 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4530 mon.send_reply(op, build_latest_full(op->get_session()->con_features));
4531 }
4532
4533 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4534 {
4535 op->mark_osdmon_event(__func__);
4536
4537 MonSession *s = op->get_session();
4538 ceph_assert(s);
4539
4540 if (s->proxy_con) {
4541 // oh, we can tell the other mon to do it
4542 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4543 << first << dendl;
4544 MRoute *r = new MRoute(s->proxy_tid, NULL);
4545 r->send_osdmap_first = first;
4546 s->proxy_con->send_message(r);
4547 op->mark_event("reply: send routed send_osdmap_first reply");
4548 } else {
4549 // do it ourselves
4550 send_incremental(first, s, false, op);
4551 }
4552 }
4553
// Send maps [first .. current] to `session`. When `req` is set, exactly one
// message is sent as a routed reply (the peer re-requests the rest); when
// unset, messages are streamed on the session's connection in chunks of
// osd_map_message_max, unless `onetime` limits it to one chunk.
// session->osd_epoch tracks the newest epoch already delivered.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  // skip epochs the session has already been sent
  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested epochs have been trimmed; start over from a full map at
    // the oldest committed epoch
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->cluster_osdmap_trim_lower_bound = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // routed reply: one message only, the peer will ask for the rest
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // req implies a single chunk, same as onetime (loop exits below)
    if (onetime || req)
      break;
  }
}
4616
// Fetch incremental map `ver`, encoded for the current quorum's features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4621
// Re-encode the incremental map in `bl` in place, restricted to `features`
// intersected with the incremental's own canonical encode features. Any
// embedded full map or crush map is re-encoded the same way.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4649
4650 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4651 {
4652 OSDMap m;
4653 auto q = bl.cbegin();
4654 m.decode(q);
4655 // always encode with subset of osdmap's canonical features
4656 uint64_t f = features & m.get_encoding_features();
4657 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4658 << dendl;
4659 bl.clear();
4660 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4661 }
4662
4663 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4664 {
4665 uint64_t significant_features = OSDMap::get_significant_features(features);
4666 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4667 return 0;
4668 }
4669 int ret = PaxosService::get_version(ver, bl);
4670 if (ret < 0) {
4671 return ret;
4672 }
4673 // NOTE: this check is imprecise; the OSDMap encoding features may
4674 // be a subset of the latest mon quorum features, but worst case we
4675 // reencode once and then cache the (identical) result under both
4676 // feature masks.
4677 if (significant_features !=
4678 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4679 reencode_incremental_map(bl, features);
4680 }
4681 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4682 return 0;
4683 }
4684
4685 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4686 {
4687 bufferlist inc_bl;
4688 int err = get_version(ver, inc_bl);
4689 ceph_assert(err == 0);
4690 ceph_assert(inc_bl.length());
4691
4692 auto p = inc_bl.cbegin();
4693 inc.decode(p);
4694 dout(10) << __func__ << " "
4695 << " epoch " << inc.epoch
4696 << " inc_crc " << inc.inc_crc
4697 << " full_crc " << inc.full_crc
4698 << " encode_features " << inc.encode_features << dendl;
4699 return 0;
4700 }
4701
// Reconstruct the full osdmap for epoch `ver` by starting from the closest
// pinned full map at or below it (or a cached full map in between) and
// replaying incrementals up to `ver`. The result is encoded into `bl`.
// Returns -ENOENT when no pinned map covers `ver`; asserts on store errors.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer a cached full map between the pin and ver-1 as a closer base
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
				&osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // remember the last incremental's encode features for the final encode
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4801
// Fetch full map `ver`, encoded for the current quorum's features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4806
4807 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4808 bufferlist& bl)
4809 {
4810 uint64_t significant_features = OSDMap::get_significant_features(features);
4811 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4812 return 0;
4813 }
4814 int ret = PaxosService::get_version_full(ver, bl);
4815 if (ret == -ENOENT) {
4816 // build map?
4817 ret = get_full_from_pinned_map(ver, bl);
4818 }
4819 if (ret < 0) {
4820 return ret;
4821 }
4822 // NOTE: this check is imprecise; the OSDMap encoding features may
4823 // be a subset of the latest mon quorum features, but worst case we
4824 // reencode once and then cache the (identical) result under both
4825 // feature masks.
4826 if (significant_features !=
4827 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4828 reencode_full_map(bl, features);
4829 }
4830 full_osd_cache.add_bytes({ver, significant_features}, bl);
4831 return 0;
4832 }
4833
4834 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4835 {
4836 dout(10) << "blocklist " << av << " until " << until << dendl;
4837 for (auto a : av.v) {
4838 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4839 a.set_type(entity_addr_t::TYPE_ANY);
4840 } else {
4841 a.set_type(entity_addr_t::TYPE_LEGACY);
4842 }
4843 pending_inc.new_blocklist[a] = until;
4844 }
4845 return pending_inc.epoch;
4846 }
4847
4848 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4849 {
4850 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4851 a.set_type(entity_addr_t::TYPE_ANY);
4852 } else {
4853 a.set_type(entity_addr_t::TYPE_LEGACY);
4854 }
4855 dout(10) << "blocklist " << a << " until " << until << dendl;
4856 pending_inc.new_blocklist[a] = until;
4857 return pending_inc.epoch;
4858 }
4859
4860
4861 void OSDMonitor::check_osdmap_subs()
4862 {
4863 dout(10) << __func__ << dendl;
4864 if (!osdmap.get_epoch()) {
4865 return;
4866 }
4867 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4868 if (osdmap_subs == mon.session_map.subs.end()) {
4869 return;
4870 }
4871 auto p = osdmap_subs->second->begin();
4872 while (!p.end()) {
4873 auto sub = *p;
4874 ++p;
4875 check_osdmap_sub(sub);
4876 }
4877 }
4878
4879 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4880 {
4881 dout(10) << __func__ << " " << sub << " next " << sub->next
4882 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4883 if (sub->next <= osdmap.get_epoch()) {
4884 if (sub->next >= 1)
4885 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4886 else
4887 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4888 if (sub->onetime)
4889 mon.session_map.remove_sub(sub);
4890 else
4891 sub->next = osdmap.get_epoch() + 1;
4892 }
4893 }
4894
4895 void OSDMonitor::check_pg_creates_subs()
4896 {
4897 if (!osdmap.get_num_up_osds()) {
4898 return;
4899 }
4900 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4901 mon.with_session_map([this](const MonSessionMap& session_map) {
4902 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4903 if (pg_creates_subs == session_map.subs.end()) {
4904 return;
4905 }
4906 for (auto sub : *pg_creates_subs->second) {
4907 check_pg_creates_sub(sub);
4908 }
4909 });
4910 }
4911
4912 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4913 {
4914 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4915 ceph_assert(sub->type == "osd_pg_creates");
4916 // only send these if the OSD is up. we will check_subs() when they do
4917 // come up so they will get the creates then.
4918 if (sub->session->name.is_osd() &&
4919 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4920 sub->next = send_pg_creates(sub->session->name.num(),
4921 sub->session->con.get(),
4922 sub->next);
4923 }
4924 }
4925
// Enable an application (e.g. "rbd", "cephfs", "rgw") on a pool, and
// optionally set one application key/value pair, by staging an updated
// pg_pool_t in the pending incremental.
//
// @param pool_id    id of an existing pool (must exist; asserted)
// @param app_name   application to enable on the pool
// @param app_key    metadata key to set; empty means "just enable the app"
// @param app_value  value for app_key
// @param force      overwrite an existing key/value; without force an
//                   already-enabled application is left untouched
//
// Caller must have plugged paxos and hold a writeable map (asserted).
void OSDMonitor::do_application_enable(int64_t pool_id,
                                       const std::string &app_name,
                                       const std::string &app_key,
                                       const std::string &app_value,
                                       bool force)
{
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
           << dendl;

  // application metadata only exists on luminous+ maps
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the committed pool, but prefer a copy already staged in
  // this round's pending incremental so we don't clobber earlier edits
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; insert() is a no-op if
    // the application is already present
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      // overwrite (or create) the key unconditionally
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // only takes effect if the application was not already enabled;
      // an existing application's metadata is deliberately left alone
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4959
4960 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4961 pool_opts_t::key_t opt,
4962 pool_opts_t::value_t val)
4963 {
4964 dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
4965 << " val: " << val << dendl;
4966 auto p = pending_inc.new_pools.try_emplace(
4967 pool_id, *osdmap.get_pg_pool(pool_id));
4968 p.first->second.opts.set(opt, val);
4969 }
4970
4971 unsigned OSDMonitor::scan_for_creating_pgs(
4972 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4973 const mempool::osdmap::set<int64_t>& removed_pools,
4974 utime_t modified,
4975 creating_pgs_t* creating_pgs) const
4976 {
4977 unsigned queued = 0;
4978 for (auto& p : pools) {
4979 int64_t poolid = p.first;
4980 if (creating_pgs->created_pools.count(poolid)) {
4981 dout(10) << __func__ << " already created " << poolid << dendl;
4982 continue;
4983 }
4984 const pg_pool_t& pool = p.second;
4985 int ruleno = pool.get_crush_rule();
4986 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4987 continue;
4988
4989 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4990 const auto created = pool.get_last_change();
4991 if (last_scan_epoch && created <= last_scan_epoch) {
4992 dout(10) << __func__ << " no change in pool " << poolid
4993 << " " << pool << dendl;
4994 continue;
4995 }
4996 if (removed_pools.count(poolid)) {
4997 dout(10) << __func__ << " pool is being removed: " << poolid
4998 << " " << pool << dendl;
4999 continue;
5000 }
5001 dout(10) << __func__ << " queueing pool create for " << poolid
5002 << " " << pool << dendl;
5003 creating_pgs->create_pool(poolid, pool.get_pg_num(),
5004 created, modified);
5005 queued++;
5006 }
5007 return queued;
5008 }
5009
// Rebuild creating_pgs_by_osd_epoch from creating_pgs using the current
// pg-to-osd mapping: for each pg still being created, decide which osd
// (acting primary) should receive the create message and at which map
// epoch, then swap the rebuilt index into place.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
           << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  // creating_pgs / creating_pgs_by_osd_epoch are shared with
  // send_pg_creates(); hold the lock for the whole rebuild
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    // the pool (or pg) may have been deleted since it was queued
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
               << dendl;
      continue;
    }
    // default epoch: the epoch the pg create was queued at
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
        if (pgs.second.count(spgid)) {
          if (last_acting_primary == acting_primary) {
            // same target as before: keep the epoch we already used so
            // the subscriber's cursor stays valid
            mapped = pgs.first;
          } else {
            dout(20) << __func__ << " " << pgid << " "
                     << " acting_primary:" << last_acting_primary
                     << " -> " << acting_primary << dendl;
            // note epoch if the target of the create message changed.
            mapped = mapping.get_epoch();
          }
          break;
        } else {
          // newly creating
          // NOTE(review): this runs for every non-matching entry, so
          // 'mapped' is reset to the mapping epoch unless/until the pg
          // is found above — the final value depends on loop order;
          // preserved as-is.
          mapped = mapping.get_epoch();
        }
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
             << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5057
// Send pending pg-create messages for osd 'osd' over 'con', starting at
// subscription cursor 'next' (an epoch).
//
// @return the epoch the subscriber is current through after this call:
//         'next' unchanged if nothing was (or could be) sent, otherwise
//         one past the newest epoch whose creates were included.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  // NOTE(review): this dout reads creating_pgs_by_osd_epoch before the
  // lock below is taken — racy in principle; confirm callers' locking
  // assumptions before relying on the logged value.
  dout(30) << __func__ << " osd." << osd << " next=" << next
           << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  // if the index has not been rebuilt since the last scan it may be
  // stale; skip sending — the subscriber will be poked again later
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
             << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  auto m = make_message<MOSDPGCreate2>(creating_pgs_epoch);

  epoch_t last = 0;
  // only epochs >= next are news to this subscriber
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      m->pgs.emplace(pg, make_pair(create->second.create_epoch,
                                   create->second.create_stamp));
      // include history/past_intervals when present (e.g. pgs created
      // by a split rather than a brand-new pool)
      if (create->second.history.epoch_created) {
        dout(20) << __func__ << " " << pg << " " << create->second.history
                 << " " << create->second.past_intervals << dendl;
        m->pg_extra.emplace(pg, make_pair(create->second.history,
                                          create->second.past_intervals));
      }
      dout(20) << __func__ << " will create " << pg
               << " at " << create->second.create_epoch << dendl;
    }
  }
  if (!m->pgs.empty()) {
    con->send_message2(std::move(m));
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
             << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5112
5113 // TICK
5114
5115
// Periodic housekeeping. Runs on every monitor (manifest reload, cache
// tuning); the rest — marking unresponsive osds down, auto-marking
// long-down osds out, expiring blocklist entries, pruning — runs only
// on the leader and triggers a paxos proposal when anything changed.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
               << " inc cache_bytes: " << inc_cache->get_cache_bytes()
               << " inc comtd_bytes: " << inc_cache->get_committed_size()
               << " inc used_bytes: " << inc_cache->_get_used_bytes()
               << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
               << dendl;
      dout(10) << "tick balancer "
               << " full cache_bytes: " << full_cache->get_cache_bytes()
               << " full comtd_bytes: " << full_cache->get_committed_size()
               << " full used_bytes: " << full_cache->_get_used_bytes()
               << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
               << dendl;
    }
  }

  // everything below mutates pending state; leader only
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark osds down that have stopped sending beacons
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;  // how long osd 'o' has been down
      // advance before any possible erase of 'o' below
      ++i;

      if (osdmap.is_down(o) &&
          osdmap.is_in(o) &&
          can_mark_out(o)) {
        utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
        utime_t grace = orig_grace;
        double my_grace = 0.0;

        if (g_conf()->mon_osd_adjust_down_out_interval) {
          // scale grace period the same way we do the heartbeat grace.
          const osd_xinfo_t& xi = osdmap.get_xinfo(o);
          double halflife = (double)g_conf()->mon_osd_laggy_halflife;
          double decay_k = ::log(.5) / halflife;
          double decay = exp((double)down * decay_k);
          dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
                   << " down for " << down << " decay " << decay << dendl;
          my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
          grace += my_grace;
        }

        // is this an entire large subtree down?
        if (down_out_subtree_limit.length()) {
          int type = osdmap.crush->get_type_id(down_out_subtree_limit);
          if (type > 0) {
            if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
              dout(10) << "tick entire containing " << down_out_subtree_limit
                       << " subtree for osd." << o
                       << " is down; resetting timer" << dendl;
              // reset timer, too.
              down_pending_out[o] = now;
              continue;
            }
          }
        }

        // destroyed osds get their own (usually shorter) out interval
        bool down_out = !osdmap.is_destroyed(o) &&
          g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
        bool destroyed_out = osdmap.is_destroyed(o) &&
          g_conf()->mon_osd_destroyed_out_interval > 0 &&
          // this is not precise enough as we did not make a note when this osd
          // was marked as destroyed, but let's not bother with that
          // complexity for now.
          down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
        if (down_out || destroyed_out) {
          dout(10) << "tick marking osd." << o << " OUT after " << down
                   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
          pending_inc.new_weight[o] = CEPH_OSD_OUT;

          // set the AUTOOUT bit.
          if (pending_inc.new_state.count(o) == 0)
            pending_inc.new_state[o] = 0;
          pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

          // remember previous weight
          if (pending_inc.new_xinfo.count(o) == 0)
            pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
          pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

          do_propose = true;

          mon.clog->info() << "Marking osd." << o << " out (has been down for "
                           << int(down.sec()) << " seconds)";
        } else
          continue;  // keep the timer running; not out yet
      }

      // either the osd came back / went out, or we just marked it out
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
               << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5290
5291 void OSDMonitor::_set_new_cache_sizes()
5292 {
5293 uint64_t cache_size = 0;
5294 int64_t inc_alloc = 0;
5295 int64_t full_alloc = 0;
5296 int64_t kv_alloc = 0;
5297
5298 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5299 cache_size = pcm->get_tuned_mem();
5300 inc_alloc = inc_cache->get_committed_size();
5301 full_alloc = full_cache->get_committed_size();
5302 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5303 }
5304
5305 inc_osd_cache.set_bytes(inc_alloc);
5306 full_osd_cache.set_bytes(full_alloc);
5307
5308 dout(1) << __func__ << " cache_size:" << cache_size
5309 << " inc_alloc: " << inc_alloc
5310 << " full_alloc: " << full_alloc
5311 << " kv_alloc: " << kv_alloc
5312 << dendl;
5313 }
5314
// Mark down any up osd that has not sent a beacon recently.
//
// @param now              current time
// @param last_osd_report  per-osd (time of last report, reported beacon
//                         interval) bookkeeping; entries are created,
//                         consulted, and pruned here
// @return true if at least one osd was queued to be marked down (the
//         caller should propose)
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
                                     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    if (!osdmap.is_up(i))
      continue;  // already down; nothing to time out
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
        mon.clog->info() << "osd." << i << " marked down after no beacon for "
                         << diff << " seconds";
        derr << "no beacon from osd." << i << " since " << t->second.first
             << ", " << diff << " seconds ago. marking down" << dendl;
        // queue the mark-down; NOTE(review): incremental state bits are
        // applied as an XOR, so setting CEPH_OSD_UP here flips the osd's
        // up bit off — confirm against OSDMap::Incremental semantics.
        pending_inc.new_state[i] = CEPH_OSD_UP;
        new_down = true;
      }
    }
  }
  return new_down;
}
5358
5359 static void dump_cpu_list(Formatter *f, const char *name,
5360 const string& strlist)
5361 {
5362 cpu_set_t cpu_set;
5363 size_t cpu_set_size;
5364 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5365 return;
5366 }
5367 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5368 f->open_array_section(name);
5369 for (auto cpu : cpus) {
5370 f->dump_int("cpu", cpu);
5371 }
5372 f->close_section();
5373 }
5374
// Dump monitor-side osdmap state for debugging/introspection: the map
// itself, per-osd metadata, clean-epoch bookkeeping, committed version
// range, crush map, and (if present) the osdmap manifest. Section
// order is part of the output format; do not reorder.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f, cct);
  f->close_section();

  // per-osd metadata for every existing osd id
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // only present when the monitor maintains a pinned-map manifest
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5423
namespace {
  // every pool property that "osd pool get" can report; order and
  // values are relied upon by the command handlers below.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // set difference: everything in 'first' that is not in 'second'.
  std::set<osd_pool_get_choices>
    subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
                               const std::set<osd_pool_get_choices>& second)
    {
      std::set<osd_pool_get_choices> result;
      // both inputs are ordered, so appending at end() keeps this linear
      for (const auto& choice : first) {
        if (!second.count(choice)) {
          result.insert(result.end(), choice);
        }
      }
      return result;
    }
}
5458
5459
5460 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5461 {
5462 op->mark_osdmon_event(__func__);
5463 auto m = op->get_req<MMonCommand>();
5464 int r = 0;
5465 bufferlist rdata;
5466 stringstream ss, ds;
5467
5468 cmdmap_t cmdmap;
5469 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5470 string rs = ss.str();
5471 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5472 return true;
5473 }
5474
5475 MonSession *session = op->get_session();
5476 if (!session) {
5477 derr << __func__ << " no session" << dendl;
5478 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5479 return true;
5480 }
5481
5482 string prefix;
5483 cmd_getval(cmdmap, "prefix", prefix);
5484
5485 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5486 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5487
5488 if (prefix == "osd stat") {
5489 if (f) {
5490 f->open_object_section("osdmap");
5491 osdmap.print_summary(f.get(), ds, "", true);
5492 f->close_section();
5493 f->flush(rdata);
5494 } else {
5495 osdmap.print_summary(nullptr, ds, "", true);
5496 rdata.append(ds);
5497 }
5498 }
5499 else if (prefix == "osd dump" ||
5500 prefix == "osd tree" ||
5501 prefix == "osd tree-from" ||
5502 prefix == "osd ls" ||
5503 prefix == "osd getmap" ||
5504 prefix == "osd getcrushmap" ||
5505 prefix == "osd ls-tree" ||
5506 prefix == "osd info") {
5507
5508 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5509 bufferlist osdmap_bl;
5510 int err = get_version_full(epoch, osdmap_bl);
5511 if (err == -ENOENT) {
5512 r = -ENOENT;
5513 ss << "there is no map for epoch " << epoch;
5514 goto reply;
5515 }
5516 ceph_assert(err == 0);
5517 ceph_assert(osdmap_bl.length());
5518
5519 OSDMap *p;
5520 if (epoch == osdmap.get_epoch()) {
5521 p = &osdmap;
5522 } else {
5523 p = new OSDMap;
5524 p->decode(osdmap_bl);
5525 }
5526
5527 auto sg = make_scope_guard([&] {
5528 if (p != &osdmap) {
5529 delete p;
5530 }
5531 });
5532
5533 if (prefix == "osd dump") {
5534 stringstream ds;
5535 if (f) {
5536 f->open_object_section("osdmap");
5537 p->dump(f.get(), cct);
5538 f->close_section();
5539 f->flush(ds);
5540 } else {
5541 p->print(cct, ds);
5542 }
5543 rdata.append(ds);
5544 if (!f)
5545 ds << " ";
5546 } else if (prefix == "osd ls") {
5547 if (f) {
5548 f->open_array_section("osds");
5549 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5550 if (osdmap.exists(i)) {
5551 f->dump_int("osd", i);
5552 }
5553 }
5554 f->close_section();
5555 f->flush(ds);
5556 } else {
5557 bool first = true;
5558 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5559 if (osdmap.exists(i)) {
5560 if (!first)
5561 ds << "\n";
5562 first = false;
5563 ds << i;
5564 }
5565 }
5566 }
5567 rdata.append(ds);
5568 } else if (prefix == "osd info") {
5569 int64_t osd_id;
5570 bool do_single_osd = true;
5571 if (!cmd_getval(cmdmap, "id", osd_id)) {
5572 do_single_osd = false;
5573 }
5574
5575 if (do_single_osd && !osdmap.exists(osd_id)) {
5576 ss << "osd." << osd_id << " does not exist";
5577 r = -EINVAL;
5578 goto reply;
5579 }
5580
5581 if (f) {
5582 if (do_single_osd) {
5583 osdmap.dump_osd(osd_id, f.get());
5584 } else {
5585 osdmap.dump_osds(f.get());
5586 }
5587 f->flush(ds);
5588 } else {
5589 if (do_single_osd) {
5590 osdmap.print_osd(osd_id, ds);
5591 } else {
5592 osdmap.print_osds(ds);
5593 }
5594 }
5595 rdata.append(ds);
5596 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5597 string bucket;
5598 if (prefix == "osd tree-from") {
5599 cmd_getval(cmdmap, "bucket", bucket);
5600 if (!osdmap.crush->name_exists(bucket)) {
5601 ss << "bucket '" << bucket << "' does not exist";
5602 r = -ENOENT;
5603 goto reply;
5604 }
5605 int id = osdmap.crush->get_item_id(bucket);
5606 if (id >= 0) {
5607 ss << "\"" << bucket << "\" is not a bucket";
5608 r = -EINVAL;
5609 goto reply;
5610 }
5611 }
5612
5613 vector<string> states;
5614 cmd_getval(cmdmap, "states", states);
5615 unsigned filter = 0;
5616 for (auto& s : states) {
5617 if (s == "up") {
5618 filter |= OSDMap::DUMP_UP;
5619 } else if (s == "down") {
5620 filter |= OSDMap::DUMP_DOWN;
5621 } else if (s == "in") {
5622 filter |= OSDMap::DUMP_IN;
5623 } else if (s == "out") {
5624 filter |= OSDMap::DUMP_OUT;
5625 } else if (s == "destroyed") {
5626 filter |= OSDMap::DUMP_DESTROYED;
5627 } else {
5628 ss << "unrecognized state '" << s << "'";
5629 r = -EINVAL;
5630 goto reply;
5631 }
5632 }
5633 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5634 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5635 ss << "cannot specify both 'in' and 'out'";
5636 r = -EINVAL;
5637 goto reply;
5638 }
5639 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5640 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5641 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5642 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5643 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5644 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5645 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5646 r = -EINVAL;
5647 goto reply;
5648 }
5649 if (f) {
5650 f->open_object_section("tree");
5651 p->print_tree(f.get(), NULL, filter, bucket);
5652 f->close_section();
5653 f->flush(ds);
5654 } else {
5655 p->print_tree(NULL, &ds, filter, bucket);
5656 }
5657 rdata.append(ds);
5658 } else if (prefix == "osd getmap") {
5659 rdata.append(osdmap_bl);
5660 ss << "got osdmap epoch " << p->get_epoch();
5661 } else if (prefix == "osd getcrushmap") {
5662 p->crush->encode(rdata, mon.get_quorum_con_features());
5663 ss << p->get_crush_version();
5664 } else if (prefix == "osd ls-tree") {
5665 string bucket_name;
5666 cmd_getval(cmdmap, "name", bucket_name);
5667 set<int> osds;
5668 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5669 if (r == -ENOENT) {
5670 ss << "\"" << bucket_name << "\" does not exist";
5671 goto reply;
5672 } else if (r < 0) {
5673 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5674 goto reply;
5675 }
5676
5677 if (f) {
5678 f->open_array_section("osds");
5679 for (auto &i : osds) {
5680 if (osdmap.exists(i)) {
5681 f->dump_int("osd", i);
5682 }
5683 }
5684 f->close_section();
5685 f->flush(ds);
5686 } else {
5687 bool first = true;
5688 for (auto &i : osds) {
5689 if (osdmap.exists(i)) {
5690 if (!first)
5691 ds << "\n";
5692 first = false;
5693 ds << i;
5694 }
5695 }
5696 }
5697
5698 rdata.append(ds);
5699 }
5700 } else if (prefix == "osd getmaxosd") {
5701 if (f) {
5702 f->open_object_section("getmaxosd");
5703 f->dump_unsigned("epoch", osdmap.get_epoch());
5704 f->dump_int("max_osd", osdmap.get_max_osd());
5705 f->close_section();
5706 f->flush(rdata);
5707 } else {
5708 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5709 rdata.append(ds);
5710 }
5711 } else if (prefix == "osd utilization") {
5712 string out;
5713 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5714 if (f)
5715 f->flush(rdata);
5716 else
5717 rdata.append(out);
5718 r = 0;
5719 goto reply;
5720 } else if (prefix == "osd find") {
5721 int64_t osd;
5722 if (!cmd_getval(cmdmap, "id", osd)) {
5723 ss << "unable to parse osd id value '"
5724 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5725 r = -EINVAL;
5726 goto reply;
5727 }
5728 if (!osdmap.exists(osd)) {
5729 ss << "osd." << osd << " does not exist";
5730 r = -ENOENT;
5731 goto reply;
5732 }
5733 string format;
5734 cmd_getval(cmdmap, "format", format);
5735 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5736 f->open_object_section("osd_location");
5737 f->dump_int("osd", osd);
5738 f->dump_object("addrs", osdmap.get_addrs(osd));
5739 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5740
5741 // try to identify host, pod/container name, etc.
5742 map<string,string> m;
5743 load_metadata(osd, m, nullptr);
5744 if (auto p = m.find("hostname"); p != m.end()) {
5745 f->dump_string("host", p->second);
5746 }
5747 for (auto& k : {
5748 "pod_name", "pod_namespace", // set by rook
5749 "container_name" // set by cephadm, ceph-ansible
5750 }) {
5751 if (auto p = m.find(k); p != m.end()) {
5752 f->dump_string(k, p->second);
5753 }
5754 }
5755
5756 // crush is helpful too
5757 f->open_object_section("crush_location");
5758 map<string,string> loc = osdmap.crush->get_full_location(osd);
5759 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5760 f->dump_string(p->first.c_str(), p->second);
5761 f->close_section();
5762 f->close_section();
5763 f->flush(rdata);
5764 } else if (prefix == "osd metadata") {
5765 int64_t osd = -1;
5766 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5767 !cmd_getval(cmdmap, "id", osd)) {
5768 ss << "unable to parse osd id value '"
5769 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5770 r = -EINVAL;
5771 goto reply;
5772 }
5773 if (osd >= 0 && !osdmap.exists(osd)) {
5774 ss << "osd." << osd << " does not exist";
5775 r = -ENOENT;
5776 goto reply;
5777 }
5778 string format;
5779 cmd_getval(cmdmap, "format", format);
5780 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5781 if (osd >= 0) {
5782 f->open_object_section("osd_metadata");
5783 f->dump_unsigned("id", osd);
5784 r = dump_osd_metadata(osd, f.get(), &ss);
5785 if (r < 0)
5786 goto reply;
5787 f->close_section();
5788 } else {
5789 r = 0;
5790 f->open_array_section("osd_metadata");
5791 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5792 if (osdmap.exists(i)) {
5793 f->open_object_section("osd");
5794 f->dump_unsigned("id", i);
5795 r = dump_osd_metadata(i, f.get(), NULL);
5796 if (r == -EINVAL || r == -ENOENT) {
5797 // Drop error, continue to get other daemons' metadata
5798 dout(4) << "No metadata for osd." << i << dendl;
5799 r = 0;
5800 } else if (r < 0) {
5801 // Unexpected error
5802 goto reply;
5803 }
5804 f->close_section();
5805 }
5806 }
5807 f->close_section();
5808 }
5809 f->flush(rdata);
5810 } else if (prefix == "osd versions") {
5811 if (!f)
5812 f.reset(Formatter::create("json-pretty"));
5813 count_metadata("ceph_version", f.get());
5814 f->flush(rdata);
5815 r = 0;
5816 } else if (prefix == "osd count-metadata") {
5817 if (!f)
5818 f.reset(Formatter::create("json-pretty"));
5819 string field;
5820 cmd_getval(cmdmap, "property", field);
5821 count_metadata(field, f.get());
5822 f->flush(rdata);
5823 r = 0;
} else if (prefix == "osd numa-status") {
  // Report NUMA placement per OSD (network / objectstore node, affinity,
  // CPU list) either as structured formatter output or as a text table.
  TextTable tbl;
  if (f) {
    f->open_array_section("osds");
  } else {
    tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
    tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
    tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
    tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
  }
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      map<string,string> m;
      ostringstream err;
      // OSDs whose metadata cannot be loaded are silently skipped.
      if (load_metadata(i, m, &err) < 0) {
        continue;
      }
      string host;
      auto p = m.find("hostname");
      if (p != m.end()) {
        host = p->second;
      }
      if (f) {
        f->open_object_section("osd");
        f->dump_int("osd", i);
        f->dump_string("host", host);
        // Single-node properties: emitted as plain ints when present.
        for (auto n : { "network_numa_node", "objectstore_numa_node",
              "numa_node" }) {
          p = m.find(n);
          if (p != m.end()) {
            f->dump_int(n, atoi(p->second.c_str()));
          }
        }
        // Multi-node properties: comma-separated lists become int arrays.
        for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
          p = m.find(n);
          if (p != m.end()) {
            list<string> ls = get_str_list(p->second, ",");
            f->open_array_section(n);
            for (auto node : ls) {
              f->dump_int("node", atoi(node.c_str()));
            }
            f->close_section();
          }
        }
        for (auto n : { "numa_node_cpus" }) {
          p = m.find(n);
          if (p != m.end()) {
            dump_cpu_list(f.get(), n, p->second);
          }
        }
        f->close_section();
      } else {
        // Text table row; "-" marks unknown values.
        tbl << i;
        tbl << host;
        p = m.find("network_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        p = m.find("objectstore_numa_nodes");
        if (p != m.end()) {
          tbl << p->second;
        } else {
          tbl << "-";
        }
        // AFFINITY and CPUS are only shown when both are known.
        p = m.find("numa_node");
        auto q = m.find("numa_node_cpus");
        if (p != m.end() && q != m.end()) {
          tbl << p->second;
          tbl << q->second;
        } else {
          tbl << "-";
          tbl << "-";
        }
        tbl << TextTable::endrow;
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(stringify(tbl));
  }
} else if (prefix == "osd map") {
  // Map a (pool, object[, namespace]) triple to its PG and the current
  // up/acting OSD sets.
  string poolstr, objstr, namespacestr;
  cmd_getval(cmdmap, "pool", poolstr);
  cmd_getval(cmdmap, "object", objstr);
  cmd_getval(cmdmap, "nspace", namespacestr);

  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "pool " << poolstr << " does not exist";
    r = -ENOENT;
    goto reply;
  }
  object_locator_t oloc(pool, namespacestr);
  object_t oid(objstr);
  pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
  // Normalize the raw pgid onto the pool's actual PG set.
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  vector<int> up, acting;
  int up_p, acting_p;
  osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);

  // Present the object as "namespace/name" when a namespace was given.
  string fullobjname;
  if (!namespacestr.empty())
    fullobjname = namespacestr + string("/") + oid.name;
  else
    fullobjname = oid.name;
  if (f) {
    f->open_object_section("osd_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_string("pool", poolstr);
    f->dump_int("pool_id", pool);
    f->dump_stream("objname") << fullobjname;
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("up_primary", up_p);
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_int("osd", *p);
    f->close_section();
    f->dump_int("acting_primary", acting_p);
    f->close_section(); // osd_map
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pool '" << poolstr << "' (" << pool << ")"
       << " object '" << fullobjname << "' ->"
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
       << pg_vector_string(acting) << ", p" << acting_p << ")";
    rdata.append(ds);
  }

} else if (prefix == "pg map") {
  // Show the up/acting OSD sets for an explicitly named pgid.
  pg_t pgid;
  vector<int> up, acting;
  r = parse_pgid(cmdmap, ss, pgid);
  if (r < 0)
    goto reply;
  pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
  osdmap.pg_to_up_acting_osds(pgid, up, acting);
  if (f) {
    f->open_object_section("pg_map");
    f->dump_unsigned("epoch", osdmap.get_epoch());
    f->dump_stream("raw_pgid") << pgid;
    f->dump_stream("pgid") << mpgid;
    f->open_array_section("up");
    for (auto osd : up) {
      f->dump_int("up_osd", osd);
    }
    f->close_section();
    f->open_array_section("acting");
    for (auto osd : acting) {
      f->dump_int("acting_osd", osd);
    }
    f->close_section();
    f->close_section();
    f->flush(rdata);
  } else {
    ds << "osdmap e" << osdmap.get_epoch()
       << " pg " << pgid << " (" << mpgid << ")"
       << " -> up " << up << " acting " << acting;
    rdata.append(ds);
  }
  goto reply;

} else if (prefix == "osd lspools") {
  // List all pools as "id name" pairs.
  if (f)
    f->open_array_section("pools");
  for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
       p != osdmap.pools.end();
       ++p) {
    if (f) {
      f->open_object_section("pool");
      f->dump_int("poolnum", p->first);
      f->dump_string("poolname", osdmap.pool_name[p->first]);
      f->close_section();
    } else {
      ds << p->first << ' ' << osdmap.pool_name[p->first];
      // newline between entries but not after the final one
      if (next(p) != osdmap.pools.end()) {
        ds << '\n';
      }
    }
  }
  if (f) {
    f->close_section();
    f->flush(ds);
  }
  rdata.append(ds);
} else if (prefix == "osd blocklist ls" ||
           prefix == "osd blacklist ls") {
  // Dump the per-address blocklist followed by the range blocklist.
  // ("blacklist" is the legacy spelling of the same command.)
  if (f)
    f->open_array_section("blocklist");

  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (f) {
      f->open_object_section("entry");
      f->dump_string("addr", p->first.get_legacy_str());
      f->dump_stream("until") << p->second;
      f->close_section();
    } else {
      // NOTE: this local ss intentionally shadows the outer ss; it is only
      // used to render "addr until" for one entry.
      stringstream ss;
      string s;
      ss << p->first << " " << p->second;
      getline(ss, s);
      s += "\n";
      rdata.append(s);
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  }
  if (f)
    f->open_array_section("range_blocklist");

  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (f) {
      f->open_object_section("entry");
      f->dump_string("range", p->first.get_legacy_str());
      f->dump_stream("until") << p->second;
      f->close_section();
    } else {
      // local ss shadows the outer ss, as above
      stringstream ss;
      string s;
      ss << p->first << " " << p->second;
      getline(ss, s);
      s += "\n";
      rdata.append(s);
    }
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  }
  // Human-readable summary goes to the outer ss (the command status string).
  ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";

} else if (prefix == "osd pool ls") {
  // List pool names; with "detail", include the full pool dump (and the
  // read-balance score when a formatter is in use).
  string detail;
  cmd_getval(cmdmap, "detail", detail);
  if (!f && detail == "detail") {
    // plain-text detail listing has its own printer
    ostringstream ss;
    osdmap.print_pools(cct, ss);
    rdata.append(ss.str());
  } else {
    if (f)
      f->open_array_section("pools");
    for (auto &[pid, pdata] : osdmap.get_pools()) {
      if (f) {
        if (detail == "detail") {
          f->open_object_section("pool");
          f->dump_int("pool_id", pid);
          f->dump_string("pool_name", osdmap.get_pool_name(pid));
          pdata.dump(f.get());
          osdmap.dump_read_balance_score(cct, pid, pdata, f.get());
          f->close_section();
        } else {
          f->dump_string("pool_name", osdmap.get_pool_name(pid));
        }
      } else {
        rdata.append(osdmap.get_pool_name(pid) + "\n");
      }
    }
    if (f) {
      f->close_section();
      f->flush(rdata);
    }
  }

} else if (prefix == "osd crush get-tunable") {
  // Read a single CRUSH tunable; only "straw_calc_version" is supported.
  string tunable;
  cmd_getval(cmdmap, "tunable", tunable);
  ostringstream rss;
  if (f)
    f->open_object_section("tunable");
  if (tunable == "straw_calc_version") {
    if (f)
      f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
    else
      rss << osdmap.crush->get_straw_calc_version() << "\n";
  } else {
    // unknown tunable name
    r = -EINVAL;
    goto reply;
  }
  if (f) {
    f->close_section();
    f->flush(rdata);
  } else {
    rdata.append(rss.str());
  }
  r = 0;

} else if (prefix == "osd pool get") {
  // "osd pool get <pool> <var>": read one pool property, or all applicable
  // properties when var == "all".
  string poolstr;
  cmd_getval(cmdmap, "pool", poolstr);
  int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
  if (pool < 0) {
    ss << "unrecognized pool '" << poolstr << "'";
    r = -ENOENT;
    goto reply;
  }

  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  string var;
  cmd_getval(cmdmap, "var", var);

  // Map from user-visible property name to the osd_pool_get_choices enum.
  typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
  const choices_map_t ALL_CHOICES = {
    {"size", SIZE},
    {"min_size", MIN_SIZE},
    {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
    {"crush_rule", CRUSH_RULE},
    {"hashpspool", HASHPSPOOL},
    {"eio", POOL_EIO},
    {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
    {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
    {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
    {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
    {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
    {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
    {"use_gmt_hitset", USE_GMT_HITSET},
    {"target_max_objects", TARGET_MAX_OBJECTS},
    {"target_max_bytes", TARGET_MAX_BYTES},
    {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
    {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
    {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
    {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
    {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
    {"erasure_code_profile", ERASURE_CODE_PROFILE},
    {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
    {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
    {"fast_read", FAST_READ},
    {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
    {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
    {"scrub_min_interval", SCRUB_MIN_INTERVAL},
    {"scrub_max_interval", SCRUB_MAX_INTERVAL},
    {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
    {"recovery_priority", RECOVERY_PRIORITY},
    {"recovery_op_priority", RECOVERY_OP_PRIORITY},
    {"scrub_priority", SCRUB_PRIORITY},
    {"compression_mode", COMPRESSION_MODE},
    {"compression_algorithm", COMPRESSION_ALGORITHM},
    {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
    {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
    {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
    {"csum_type", CSUM_TYPE},
    {"csum_max_block", CSUM_MAX_BLOCK},
    {"csum_min_block", CSUM_MIN_BLOCK},
    {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
    {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
    {"pg_num_min", PG_NUM_MIN},
    {"pg_num_max", PG_NUM_MAX},
    {"target_size_bytes", TARGET_SIZE_BYTES},
    {"target_size_ratio", TARGET_SIZE_RATIO},
    {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
    {"dedup_tier", DEDUP_TIER},
    {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
    {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
    {"bulk", BULK}
  };

  // Decide which choices to print, validating that tier-only / erasure-only
  // properties are not requested on pools of the wrong type.
  typedef std::set<osd_pool_get_choices> choices_set_t;

  // Properties that only make sense on cache-tier pools.
  const choices_set_t ONLY_TIER_CHOICES = {
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
  };
  // Properties that only make sense on erasure-coded pools.
  const choices_set_t ONLY_ERASURE_CHOICES = {
    EC_OVERWRITES, ERASURE_CODE_PROFILE
  };

  choices_set_t selected_choices;
  if (var == "all") {
    // Start from every known property ...
    for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
        it != ALL_CHOICES.end(); ++it) {
      selected_choices.insert(it->second);
    }

    // ... then drop the ones that do not apply to this pool's type.
    if(!p->is_tier()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_TIER_CHOICES);
    }

    if(!p->is_erasure()) {
      selected_choices = subtract_second_from_first(selected_choices,
                                                    ONLY_ERASURE_CHOICES);
    }
  } else /* var != "all" */ {
    choices_map_t::const_iterator found = ALL_CHOICES.find(var);
    if (found == ALL_CHOICES.end()) {
      ss << "pool '" << poolstr
         << "': invalid variable: '" << var << "'";
      r = -EINVAL;
      goto reply;
    }

    osd_pool_get_choices selected = found->second;

    if (!p->is_tier() &&
        ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
      ss << "pool '" << poolstr
         << "' is not a tier pool: variable not applicable";
      r = -EACCES;
      goto reply;
    }

    if (!p->is_erasure() &&
        ONLY_ERASURE_CHOICES.find(selected)
        != ONLY_ERASURE_CHOICES.end()) {
      ss << "pool '" << poolstr
         << "' is not a erasure pool: variable not applicable";
      r = -EACCES;
      goto reply;
    }

    // pool_opts-backed properties must actually be set to be readable
    if (pool_opts_t::is_opt_name(var) &&
        !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
      ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
      r = -ENOENT;
      goto reply;
    }

    selected_choices.insert(selected);
  }

  // Structured (formatter) output of the selected properties.
  if (f) {
    f->open_object_section("pool");
    f->dump_string("pool", poolstr);
    f->dump_int("pool_id", pool);
    for(choices_set_t::const_iterator it = selected_choices.begin();
        it != selected_choices.end(); ++it) {
      // Reverse-lookup the user-visible name for this choice.
      choices_map_t::const_iterator i;
      for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
        if (i->second == *it) {
          break;
        }
      }
      ceph_assert(i != ALL_CHOICES.end());
      switch(*it) {
        case PG_NUM:
          f->dump_int("pg_num", p->get_pg_num());
          break;
        case PGP_NUM:
          f->dump_int("pgp_num", p->get_pgp_num());
          break;
        case SIZE:
          f->dump_int("size", p->get_size());
          break;
        case MIN_SIZE:
          f->dump_int("min_size", p->get_min_size());
          break;
        case CRUSH_RULE:
          // prefer the rule name; fall back to the numeric id
          if (osdmap.crush->rule_exists(p->get_crush_rule())) {
            f->dump_string("crush_rule", osdmap.crush->get_rule_name(
                             p->get_crush_rule()));
          } else {
            f->dump_string("crush_rule", stringify(p->get_crush_rule()));
          }
          break;
        case EC_OVERWRITES:
          f->dump_bool("allow_ec_overwrites",
                       p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
          break;
        case PG_AUTOSCALE_MODE:
          f->dump_string("pg_autoscale_mode",
                         pg_pool_t::get_pg_autoscale_mode_name(
                           p->pg_autoscale_mode));
          break;
        // boolean pool flags, all dumped via the flag name
        case HASHPSPOOL:
        case POOL_EIO:
        case NODELETE:
        case BULK:
        case NOPGCHANGE:
        case NOSIZECHANGE:
        case WRITE_FADVISE_DONTNEED:
        case NOSCRUB:
        case NODEEP_SCRUB:
          f->dump_bool(i->first.c_str(),
                       p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
          break;
        case HIT_SET_PERIOD:
          f->dump_int("hit_set_period", p->hit_set_period);
          break;
        case HIT_SET_COUNT:
          f->dump_int("hit_set_count", p->hit_set_count);
          break;
        case HIT_SET_TYPE:
          f->dump_string("hit_set_type",
                         HitSet::get_type_name(p->hit_set_params.get_type()));
          break;
        case HIT_SET_FPP:
          {
            // false-positive probability only exists for bloom hit sets;
            // when explicitly requested on another type it is an error,
            // but with var == "all" it is just omitted.
            if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
              BloomHitSet::Params *bloomp =
                static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
              f->dump_float("hit_set_fpp", bloomp->get_fpp());
            } else if(var != "all") {
              f->close_section();
              ss << "hit set is not of type Bloom; " <<
                "invalid to get a false positive rate!";
              r = -EINVAL;
              goto reply;
            }
          }
          break;
        case USE_GMT_HITSET:
          f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
          break;
        case TARGET_MAX_OBJECTS:
          f->dump_unsigned("target_max_objects", p->target_max_objects);
          break;
        case TARGET_MAX_BYTES:
          f->dump_unsigned("target_max_bytes", p->target_max_bytes);
          break;
        // cache ratios are stored in micro units; dump both forms
        case CACHE_TARGET_DIRTY_RATIO:
          f->dump_unsigned("cache_target_dirty_ratio_micro",
                           p->cache_target_dirty_ratio_micro);
          f->dump_float("cache_target_dirty_ratio",
                        ((float)p->cache_target_dirty_ratio_micro/1000000));
          break;
        case CACHE_TARGET_DIRTY_HIGH_RATIO:
          f->dump_unsigned("cache_target_dirty_high_ratio_micro",
                           p->cache_target_dirty_high_ratio_micro);
          f->dump_float("cache_target_dirty_high_ratio",
                        ((float)p->cache_target_dirty_high_ratio_micro/1000000));
          break;
        case CACHE_TARGET_FULL_RATIO:
          f->dump_unsigned("cache_target_full_ratio_micro",
                           p->cache_target_full_ratio_micro);
          f->dump_float("cache_target_full_ratio",
                        ((float)p->cache_target_full_ratio_micro/1000000));
          break;
        case CACHE_MIN_FLUSH_AGE:
          f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
          break;
        case CACHE_MIN_EVICT_AGE:
          f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
          break;
        case ERASURE_CODE_PROFILE:
          f->dump_string("erasure_code_profile", p->erasure_code_profile);
          break;
        case MIN_READ_RECENCY_FOR_PROMOTE:
          f->dump_int("min_read_recency_for_promote",
                      p->min_read_recency_for_promote);
          break;
        case MIN_WRITE_RECENCY_FOR_PROMOTE:
          f->dump_int("min_write_recency_for_promote",
                      p->min_write_recency_for_promote);
          break;
        case FAST_READ:
          f->dump_int("fast_read", p->fast_read);
          break;
        case HIT_SET_GRADE_DECAY_RATE:
          f->dump_int("hit_set_grade_decay_rate",
                      p->hit_set_grade_decay_rate);
          break;
        case HIT_SET_SEARCH_LAST_N:
          f->dump_int("hit_set_search_last_n",
                      p->hit_set_search_last_n);
          break;
        // everything below is stored in pool_opts_t and only dumped if set
        case SCRUB_MIN_INTERVAL:
        case SCRUB_MAX_INTERVAL:
        case DEEP_SCRUB_INTERVAL:
        case RECOVERY_PRIORITY:
        case RECOVERY_OP_PRIORITY:
        case SCRUB_PRIORITY:
        case COMPRESSION_MODE:
        case COMPRESSION_ALGORITHM:
        case COMPRESSION_REQUIRED_RATIO:
        case COMPRESSION_MAX_BLOB_SIZE:
        case COMPRESSION_MIN_BLOB_SIZE:
        case CSUM_TYPE:
        case CSUM_MAX_BLOCK:
        case CSUM_MIN_BLOCK:
        case FINGERPRINT_ALGORITHM:
        case PG_NUM_MIN:
        case PG_NUM_MAX:
        case TARGET_SIZE_BYTES:
        case TARGET_SIZE_RATIO:
        case PG_AUTOSCALE_BIAS:
        case DEDUP_TIER:
        case DEDUP_CHUNK_ALGORITHM:
        case DEDUP_CDC_CHUNK_SIZE:
          pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
          if (p->opts.is_set(key)) {
            // csum_type is stored numerically but shown symbolically
            if(*it == CSUM_TYPE) {
              int64_t val;
              p->opts.get(pool_opts_t::CSUM_TYPE, &val);
              f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
            } else {
              p->opts.dump(i->first, f.get());
            }
          }
          break;
      }
    }
    f->close_section();
    f->flush(rdata);
  // Plain-text output of the selected properties: one "name: value" line
  // per choice, appended to rdata as it is produced.
  } else /* !f */ {
    for(choices_set_t::const_iterator it = selected_choices.begin();
        it != selected_choices.end(); ++it) {
      choices_map_t::const_iterator i;
      switch(*it) {
        case PG_NUM:
          ss << "pg_num: " << p->get_pg_num() << "\n";
          break;
        case PGP_NUM:
          ss << "pgp_num: " << p->get_pgp_num() << "\n";
          break;
        case SIZE:
          ss << "size: " << p->get_size() << "\n";
          break;
        case MIN_SIZE:
          ss << "min_size: " << p->get_min_size() << "\n";
          break;
        case CRUSH_RULE:
          // prefer the rule name; fall back to the numeric id
          if (osdmap.crush->rule_exists(p->get_crush_rule())) {
            ss << "crush_rule: " << osdmap.crush->get_rule_name(
                 p->get_crush_rule()) << "\n";
          } else {
            ss << "crush_rule: " << p->get_crush_rule() << "\n";
          }
          break;
        case PG_AUTOSCALE_MODE:
          ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
            p->pg_autoscale_mode) <<"\n";
          break;
        case HIT_SET_PERIOD:
          ss << "hit_set_period: " << p->hit_set_period << "\n";
          break;
        case HIT_SET_COUNT:
          ss << "hit_set_count: " << p->hit_set_count << "\n";
          break;
        case HIT_SET_TYPE:
          ss << "hit_set_type: " <<
            HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
          break;
        case HIT_SET_FPP:
          {
            // only bloom hit sets have an fpp; explicit requests on other
            // types are errors, var == "all" just skips it
            if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
              BloomHitSet::Params *bloomp =
                static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
              ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
            } else if(var != "all") {
              ss << "hit set is not of type Bloom; " <<
                "invalid to get a false positive rate!";
              r = -EINVAL;
              goto reply;
            }
          }
          break;
        case USE_GMT_HITSET:
          ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
          break;
        case TARGET_MAX_OBJECTS:
          ss << "target_max_objects: " << p->target_max_objects << "\n";
          break;
        case TARGET_MAX_BYTES:
          ss << "target_max_bytes: " << p->target_max_bytes << "\n";
          break;
        // ratios are stored in micro units; print the fractional form
        case CACHE_TARGET_DIRTY_RATIO:
          ss << "cache_target_dirty_ratio: "
             << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
          break;
        case CACHE_TARGET_DIRTY_HIGH_RATIO:
          ss << "cache_target_dirty_high_ratio: "
             << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
          break;
        case CACHE_TARGET_FULL_RATIO:
          ss << "cache_target_full_ratio: "
             << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
          break;
        case CACHE_MIN_FLUSH_AGE:
          ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
          break;
        case CACHE_MIN_EVICT_AGE:
          ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
          break;
        case ERASURE_CODE_PROFILE:
          ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
          break;
        case MIN_READ_RECENCY_FOR_PROMOTE:
          ss << "min_read_recency_for_promote: " <<
            p->min_read_recency_for_promote << "\n";
          break;
        case HIT_SET_GRADE_DECAY_RATE:
          ss << "hit_set_grade_decay_rate: " <<
            p->hit_set_grade_decay_rate << "\n";
          break;
        case HIT_SET_SEARCH_LAST_N:
          ss << "hit_set_search_last_n: " <<
            p->hit_set_search_last_n << "\n";
          break;
        case EC_OVERWRITES:
          ss << "allow_ec_overwrites: " <<
            (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
            "\n";
          break;
        // boolean pool flags: reverse-lookup the name, print true/false
        case HASHPSPOOL:
        case POOL_EIO:
        case NODELETE:
        case BULK:
        case NOPGCHANGE:
        case NOSIZECHANGE:
        case WRITE_FADVISE_DONTNEED:
        case NOSCRUB:
        case NODEEP_SCRUB:
          for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
            if (i->second == *it)
              break;
          }
          ceph_assert(i != ALL_CHOICES.end());
          ss << i->first << ": " <<
            (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
             "true" : "false") << "\n";
          break;
        case MIN_WRITE_RECENCY_FOR_PROMOTE:
          ss << "min_write_recency_for_promote: " <<
            p->min_write_recency_for_promote << "\n";
          break;
        case FAST_READ:
          ss << "fast_read: " << p->fast_read << "\n";
          break;
        // pool_opts-backed values: printed only when actually set
        case SCRUB_MIN_INTERVAL:
        case SCRUB_MAX_INTERVAL:
        case DEEP_SCRUB_INTERVAL:
        case RECOVERY_PRIORITY:
        case RECOVERY_OP_PRIORITY:
        case SCRUB_PRIORITY:
        case COMPRESSION_MODE:
        case COMPRESSION_ALGORITHM:
        case COMPRESSION_REQUIRED_RATIO:
        case COMPRESSION_MAX_BLOB_SIZE:
        case COMPRESSION_MIN_BLOB_SIZE:
        case CSUM_TYPE:
        case CSUM_MAX_BLOCK:
        case CSUM_MIN_BLOCK:
        case FINGERPRINT_ALGORITHM:
        case PG_NUM_MIN:
        case PG_NUM_MAX:
        case TARGET_SIZE_BYTES:
        case TARGET_SIZE_RATIO:
        case PG_AUTOSCALE_BIAS:
        case DEDUP_TIER:
        case DEDUP_CHUNK_ALGORITHM:
        case DEDUP_CDC_CHUNK_SIZE:
          for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
            if (i->second == *it)
              break;
          }
          ceph_assert(i != ALL_CHOICES.end());
          {
            pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
            if (p->opts.is_set(key)) {
              // csum_type is stored numerically but shown symbolically
              if(key == pool_opts_t::CSUM_TYPE) {
                int64_t val;
                p->opts.get(key, &val);
                ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
              } else {
                ss << i->first << ": " << p->opts.get(key) << "\n";
              }
            }
          }
          break;
      }
      // flush each line to rdata and reuse ss for the next property
      rdata.append(ss.str());
      ss.str("");
    }
  }
  r = 0;
} else if (prefix == "osd pool get-quota") {
  // Show max-objects / max-bytes quotas for a pool together with current
  // usage pulled from the mgr pool stats.
  string pool_name;
  cmd_getval(cmdmap, "pool", pool_name);

  int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
  if (poolid < 0) {
    ceph_assert(poolid == -ENOENT);
    ss << "unrecognized pool '" << pool_name << "'";
    r = -ENOENT;
    goto reply;
  }
  const pg_pool_t *p = osdmap.get_pg_pool(poolid);
  const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
  if (!pstat) {
    ss << "no stats for pool '" << pool_name << "'";
    r = -ENOENT;
    goto reply;
  }
  const object_stat_sum_t& sum = pstat->stats.sum;
  if (f) {
    f->open_object_section("pool_quotas");
    f->dump_string("pool_name", pool_name);
    f->dump_unsigned("pool_id", poolid);
    f->dump_unsigned("quota_max_objects", p->quota_max_objects);
    f->dump_int("current_num_objects", sum.num_objects);
    f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
    f->dump_int("current_num_bytes", sum.num_bytes);
    f->close_section();
    f->flush(rdata);
  } else {
    stringstream rs;
    rs << "quotas for pool '" << pool_name << "':\n"
       << "  max objects: ";
    // a quota of 0 means "no quota"
    if (p->quota_max_objects == 0)
      rs << "N/A";
    else {
      rs << si_u_t(p->quota_max_objects) << " objects";
      rs << "  (current num objects: " << sum.num_objects << " objects)";
    }
    rs << "\n"
       << "  max bytes  : ";
    if (p->quota_max_bytes == 0)
      rs << "N/A";
    else {
      rs << byte_u_t(p->quota_max_bytes);
      rs << "  (current num bytes: " << sum.num_bytes << " bytes)";
    }
    rdata.append(rs.str());
  }
  rdata.append("\n");
  r = 0;
} else if (prefix == "osd crush rule list" ||
           prefix == "osd crush rule ls") {
  // List the names of all CRUSH rules.
  if (f) {
    f->open_array_section("rules");
    osdmap.crush->list_rules(f.get());
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    osdmap.crush->list_rules(&ss);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush rule ls-by-class") {
  // List the CRUSH rules that reference a given device class.
  string class_name;
  cmd_getval(cmdmap, "class", class_name);
  if (class_name.empty()) {
    ss << "no class specified";
    r = -EINVAL;
    goto reply;
  }
  set<int> rules;
  r = osdmap.crush->get_rules_by_class(class_name, &rules);
  if (r < 0) {
    ss << "failed to get rules by class '" << class_name << "'";
    goto reply;
  }
  if (f) {
    f->open_array_section("rules");
    for (auto &rule: rules) {
      f->dump_string("name", osdmap.crush->get_rule_name(rule));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream rs;
    for (auto &rule: rules) {
      rs << osdmap.crush->get_rule_name(rule) << "\n";
    }
    rdata.append(rs.str());
  }
} else if (prefix == "osd crush rule dump") {
  // Dump one named CRUSH rule, or all rules when no name is given.
  // This command always formats (json-pretty by default), so it builds its
  // own local formatter rather than relying on the outer one.
  string name;
  cmd_getval(cmdmap, "name", name);
  string format;
  cmd_getval(cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  if (name == "") {
    f->open_array_section("rules");
    osdmap.crush->dump_rules(f.get());
    f->close_section();
  } else {
    int ruleno = osdmap.crush->get_rule_id(name);
    if (ruleno < 0) {
      ss << "unknown crush rule '" << name << "'";
      r = ruleno;
      goto reply;
    }
    osdmap.crush->dump_rule(ruleno, f.get());
  }
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush dump") {
  // Dump the full CRUSH map (always formatted; json-pretty by default).
  string format;
  cmd_getval(cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map");
  osdmap.crush->dump(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush show-tunables") {
  // Dump the current CRUSH tunables (always formatted).
  string format;
  cmd_getval(cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_object_section("crush_map_tunables");
  osdmap.crush->dump_tunables(f.get());
  f->close_section();
  ostringstream rs;
  f->flush(rs);
  rs << "\n";
  rdata.append(rs.str());
} else if (prefix == "osd crush tree") {
  // Print the CRUSH hierarchy; "--show-shadow" also includes the
  // per-device-class shadow trees.  Accepts both the modern bool flag and
  // the legacy literal "--show-shadow" argument.
  bool show_shadow = false;
  if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
    std::string shadow;
    if (cmd_getval(cmdmap, "shadow", shadow) &&
        shadow == "--show-shadow") {
      show_shadow = true;
    }
  }
  boost::scoped_ptr<Formatter> f(Formatter::create(format));
  if (f) {
    f->open_object_section("crush_tree");
    osdmap.crush->dump_tree(nullptr,
                            f.get(),
                            osdmap.get_pool_names(),
                            show_shadow);
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    osdmap.crush->dump_tree(&ss,
                            nullptr,
                            osdmap.get_pool_names(),
                            show_shadow);
    rdata.append(ss.str());
  }
} else if (prefix == "osd crush ls") {
  // List the direct children of a CRUSH node (or the node itself for an
  // OSD, which has a non-negative id).
  string name;
  if (!cmd_getval(cmdmap, "node", name)) {
    ss << "no node specified";
    r = -EINVAL;
    goto reply;
  }
  if (!osdmap.crush->name_exists(name)) {
    ss << "node '" << name << "' does not exist";
    r = -ENOENT;
    goto reply;
  }
  int id = osdmap.crush->get_item_id(name);
  list<int> result;
  if (id >= 0) {
    // a device (osd): just itself
    result.push_back(id);
  } else {
    // a bucket: its immediate items
    int num = osdmap.crush->get_bucket_size(id);
    for (int i = 0; i < num; ++i) {
      result.push_back(osdmap.crush->get_bucket_item(id, i));
    }
  }
  if (f) {
    f->open_array_section("items");
    for (auto i : result) {
      f->dump_string("item", osdmap.crush->get_item_name(i));
    }
    f->close_section();
    f->flush(rdata);
  } else {
    ostringstream ss;
    for (auto i : result) {
      ss << osdmap.crush->get_item_name(i) << "\n";
    }
    rdata.append(ss.str());
  }
  r = 0;
} else if (prefix == "osd crush class ls") {
  // List all known CRUSH device class names (always formatted).
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
  f->open_array_section("crush_classes");
  for (auto i : osdmap.crush->class_name)
    f->dump_string("class", i.second);
  f->close_section();
  f->flush(rdata);
} else if (prefix == "osd crush class ls-osd") {
  // List the OSD ids belonging to a given device class.
  string name;
  cmd_getval(cmdmap, "class", name);
  set<int> osds;
  osdmap.crush->get_devices_by_class(name, &osds);
  if (f) {
    f->open_array_section("osds");
    for (auto &osd: osds)
      f->dump_int("osd", osd);
    f->close_section();
    f->flush(rdata);
  } else {
    // one id per line, no trailing newline
    bool first = true;
    for (auto &osd : osds) {
      if (!first)
        ds << "\n";
      first = false;
      ds << osd;
    }
    rdata.append(ds);
  }
} else if (prefix == "osd crush get-device-class") {
  // Report the device class of each listed OSD id ("" when unset).
  vector<string> idvec;
  cmd_getval(cmdmap, "ids", idvec);
  map<int, string> class_by_osd;
  for (auto& id : idvec) {
    ostringstream ts;
    long osd = parse_osd_id(id.c_str(), &ts);
    if (osd < 0) {
      ss << "unable to parse osd id:'" << id << "'";
      r = -EINVAL;
      goto reply;
    }
    auto device_class = osdmap.crush->get_item_class(osd);
    if (device_class)
      class_by_osd[osd] = device_class;
    else
      class_by_osd[osd] = ""; // no class
  }
  if (f) {
    f->open_array_section("osd_device_classes");
    for (auto& i : class_by_osd) {
      f->open_object_section("osd_device_class");
      f->dump_int("osd", i.first);
      f->dump_string("device_class", i.second);
      f->close_section();
    }
    f->close_section();
    f->flush(rdata);
  } else {
    if (class_by_osd.size() == 1) {
      // for single input, make a clean output
      ds << class_by_osd.begin()->second;
    } else {
      // note that we do not group osds by class here
      for (auto it = class_by_osd.begin();
           it != class_by_osd.end();
           it++) {
        ds << "osd." << it->first << ' ' << it->second;
        if (next(it) != class_by_osd.end())
          ds << '\n';
      }
    }
    rdata.append(ds);
  }
} else if (prefix == "osd erasure-code-profile ls") {
  // List the names of all erasure-code profiles in the osdmap.
  const auto &profiles = osdmap.get_erasure_code_profiles();
  if (f)
    f->open_array_section("erasure-code-profiles");
  for (auto i = profiles.begin(); i != profiles.end(); ++i) {
    if (f)
      f->dump_string("profile", i->first.c_str());
    else
      rdata.append(i->first + "\n");
  }
  if (f) {
    f->close_section();
    ostringstream rs;
    f->flush(rs);
    rs << "\n";
    rdata.append(rs.str());
  }
6900 } else if (prefix == "osd crush weight-set ls") {
6901 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6902 if (f) {
6903 f->open_array_section("weight_sets");
6904 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6905 f->dump_string("pool", "(compat)");
6906 }
6907 for (auto& i : osdmap.crush->choose_args) {
6908 if (i.first >= 0) {
6909 f->dump_string("pool", osdmap.get_pool_name(i.first));
6910 }
6911 }
6912 f->close_section();
6913 f->flush(rdata);
6914 } else {
6915 ostringstream rs;
6916 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6917 rs << "(compat)\n";
6918 }
6919 for (auto& i : osdmap.crush->choose_args) {
6920 if (i.first >= 0) {
6921 rs << osdmap.get_pool_name(i.first) << "\n";
6922 }
6923 }
6924 rdata.append(rs.str());
6925 }
6926 } else if (prefix == "osd crush weight-set dump") {
6927 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6928 "json-pretty"));
6929 osdmap.crush->dump_choose_args(f.get());
6930 f->flush(rdata);
6931 } else if (prefix == "osd erasure-code-profile get") {
6932 string name;
6933 cmd_getval(cmdmap, "name", name);
6934 if (!osdmap.has_erasure_code_profile(name)) {
6935 ss << "unknown erasure code profile '" << name << "'";
6936 r = -ENOENT;
6937 goto reply;
6938 }
6939 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6940 if (f)
6941 f->open_object_section("profile");
6942 for (map<string,string>::const_iterator i = profile.begin();
6943 i != profile.end();
6944 ++i) {
6945 if (f)
6946 f->dump_string(i->first.c_str(), i->second.c_str());
6947 else
6948 rdata.append(i->first + "=" + i->second + "\n");
6949 }
6950 if (f) {
6951 f->close_section();
6952 ostringstream rs;
6953 f->flush(rs);
6954 rs << "\n";
6955 rdata.append(rs.str());
6956 }
6957 } else if (prefix == "osd pool application get") {
6958 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6959 "json-pretty"));
6960 string pool_name;
6961 cmd_getval(cmdmap, "pool", pool_name);
6962 string app;
6963 cmd_getval(cmdmap, "app", app);
6964 string key;
6965 cmd_getval(cmdmap, "key", key);
6966
6967 if (pool_name.empty()) {
6968 // all
6969 f->open_object_section("pools");
6970 for (const auto &pool : osdmap.pools) {
6971 std::string name("<unknown>");
6972 const auto &pni = osdmap.pool_name.find(pool.first);
6973 if (pni != osdmap.pool_name.end())
6974 name = pni->second;
6975 f->open_object_section(name.c_str());
6976 for (auto &app_pair : pool.second.application_metadata) {
6977 f->open_object_section(app_pair.first.c_str());
6978 for (auto &kv_pair : app_pair.second) {
6979 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6980 }
6981 f->close_section();
6982 }
6983 f->close_section(); // name
6984 }
6985 f->close_section(); // pools
6986 f->flush(rdata);
6987 } else {
6988 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6989 if (pool < 0) {
6990 ss << "unrecognized pool '" << pool_name << "'";
6991 r = -ENOENT;
6992 goto reply;
6993 }
6994 auto p = osdmap.get_pg_pool(pool);
6995 // filter by pool
6996 if (app.empty()) {
6997 f->open_object_section(pool_name.c_str());
6998 for (auto &app_pair : p->application_metadata) {
6999 f->open_object_section(app_pair.first.c_str());
7000 for (auto &kv_pair : app_pair.second) {
7001 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7002 }
7003 f->close_section(); // application
7004 }
7005 f->close_section(); // pool_name
7006 f->flush(rdata);
7007 goto reply;
7008 }
7009
7010 auto app_it = p->application_metadata.find(app);
7011 if (app_it == p->application_metadata.end()) {
7012 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7013 r = -ENOENT;
7014 goto reply;
7015 }
7016 // filter by pool + app
7017 if (key.empty()) {
7018 f->open_object_section(app_it->first.c_str());
7019 for (auto &kv_pair : app_it->second) {
7020 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7021 }
7022 f->close_section(); // application
7023 f->flush(rdata);
7024 goto reply;
7025 }
7026 // filter by pool + app + key
7027 auto key_it = app_it->second.find(key);
7028 if (key_it == app_it->second.end()) {
7029 ss << "application '" << app << "' on pool '" << pool_name
7030 << "' does not have key '" << key << "'";
7031 r = -ENOENT;
7032 goto reply;
7033 }
7034 ss << key_it->second << "\n";
7035 rdata.append(ss.str());
7036 ss.str("");
7037 }
7038 } else if (prefix == "osd get-require-min-compat-client") {
7039 ss << osdmap.require_min_compat_client << std::endl;
7040 rdata.append(ss.str());
7041 ss.str("");
7042 goto reply;
7043 } else if (prefix == "osd pool application enable" ||
7044 prefix == "osd pool application disable" ||
7045 prefix == "osd pool application set" ||
7046 prefix == "osd pool application rm") {
7047 bool changed = false;
7048 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7049 if (r != 0) {
7050 // Error, reply.
7051 goto reply;
7052 } else if (changed) {
7053 // Valid mutation, proceed to prepare phase
7054 return false;
7055 } else {
7056 // Idempotent case, reply
7057 goto reply;
7058 }
7059 } else {
7060 // try prepare update
7061 return false;
7062 }
7063
7064 reply:
7065 string rs;
7066 getline(ss, rs);
7067 mon.reply_command(op, r, rs, rdata, get_last_committed());
7068 return true;
7069 }
7070
7071 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7072 {
7073 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7074 osdmap.get_pg_pool(pool_id));
7075 ceph_assert(pool);
7076 pool->set_flag(flags);
7077 }
7078
7079 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7080 {
7081 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7082 osdmap.get_pg_pool(pool_id));
7083 ceph_assert(pool);
7084 pool->unset_flag(flags);
7085 }
7086
7087 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7088 {
7089 char k[80];
7090 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7091 return k;
7092 }
7093
7094 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7095 {
7096 char k[80];
7097 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7098 (unsigned long long)pool, (unsigned long long)snap);
7099 return k;
7100 }
7101
// Build the value recording that snaps [snap, snap+num) of 'pool' were
// purged as of 'epoch', and return the corresponding store key.
//
// The key encodes the *last* snap of the interval so that a forward
// lower_bound() from any snap inside the interval lands on this record
// (see lookup_purged_snap()).
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  encode(snap, *v);        // interval begin (inclusive)
  encode(snap + num, *v);  // interval end (exclusive)
  encode(epoch, *v);       // epoch the purge was recorded at
  return make_purged_snap_key(pool, snap + num - 1);
}
7113
7114
// Look up whether 'snap' in 'pool' is recorded as purged in the mon store.
//
// On success returns 0 and fills [*begin, *end) with the purged interval
// containing 'snap'; returns -ENOENT when no recorded interval covers it.
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  // Keys encode the *last* snap of each interval (see
  // make_purged_snap_key_value), so lower_bound from key(pool, snap)
  // lands on the first record whose interval could contain 'snap'.
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  // We may have landed on an unrelated record type in the same prefix.
  if (it->key().find("purged_snap_") != 0) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // ... or on a purged_snap record belonging to a different pool.
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // Decode [begin, end) from the value and verify 'snap' is inside it.
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7164
// Record that snaps [start, end) of 'pool' were purged, coalescing the
// new interval with any already-recorded interval that touches it on
// either side so the store keeps maximal disjoint intervals.
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // b == 0 iff an existing interval covers start-1 (adjacent on the left);
  // a == 0 iff an existing interval covers 'end' (adjacent on the right).
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // Bridges two existing intervals: merge all three into one record.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // Extends the earlier interval rightwards; its key changes (keys
    // encode the interval's last snap), so erase the old record first.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // Extends the later interval leftwards; the last snap (and thus the
    // key) is unchanged, so a put simply overwrites the old record.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // No adjacent intervals: store a brand-new record.
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7220
// Prune snaps that the OSDs (via the mgr digest) report as purged: move
// them out of the osdmap's removed_snaps_queue by staging them in
// pending_inc.new_purged_snaps, capped at mon_max_snap_prune_per_epoch.
//
// Returns true if anything was staged for pruning this epoch.
bool OSDMonitor::try_prune_purged_snaps()
{
  // Need a readable mgr stat digest to know what the OSDs have purged.
  if (!mon.mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    // Collect the intervals we would prune for this pool.
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7299
7300 bool OSDMonitor::update_pools_status()
7301 {
7302 if (!mon.mgrstatmon()->is_readable())
7303 return false;
7304
7305 bool ret = false;
7306
7307 auto& pools = osdmap.get_pools();
7308 for (auto it = pools.begin(); it != pools.end(); ++it) {
7309 const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
7310 if (!pstat)
7311 continue;
7312 const object_stat_sum_t& sum = pstat->stats.sum;
7313 const pg_pool_t &pool = it->second;
7314 const string& pool_name = osdmap.get_pool_name(it->first);
7315
7316 bool pool_is_full =
7317 (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
7318 (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);
7319
7320 if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
7321 if (pool_is_full)
7322 continue;
7323
7324 mon.clog->info() << "pool '" << pool_name
7325 << "' no longer out of quota; removing NO_QUOTA flag";
7326 // below we cancel FLAG_FULL too, we'll set it again in
7327 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7328 clear_pool_flags(it->first,
7329 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7330 ret = true;
7331 } else {
7332 if (!pool_is_full)
7333 continue;
7334
7335 if (pool.quota_max_bytes > 0 &&
7336 (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
7337 mon.clog->warn() << "pool '" << pool_name << "' is full"
7338 << " (reached quota's max_bytes: "
7339 << byte_u_t(pool.quota_max_bytes) << ")";
7340 }
7341 if (pool.quota_max_objects > 0 &&
7342 (uint64_t)sum.num_objects >= pool.quota_max_objects) {
7343 mon.clog->warn() << "pool '" << pool_name << "' is full"
7344 << " (reached quota's max_objects: "
7345 << pool.quota_max_objects << ")";
7346 }
7347 // set both FLAG_FULL_QUOTA and FLAG_FULL
7348 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7349 // since FLAG_FULL should always take precedence
7350 set_pool_flags(it->first,
7351 pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
7352 clear_pool_flags(it->first,
7353 pg_pool_t::FLAG_NEARFULL |
7354 pg_pool_t::FLAG_BACKFILLFULL);
7355 ret = true;
7356 }
7357 }
7358 return ret;
7359 }
7360
7361 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7362 {
7363 op->mark_osdmon_event(__func__);
7364 auto m = op->get_req<MPoolOp>();
7365 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7366 MonSession *session = op->get_session();
7367 if (!session)
7368 return -EPERM;
7369 string erasure_code_profile;
7370 stringstream ss;
7371 string rule_name;
7372 bool bulk = false;
7373 int ret = 0;
7374 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7375 0, 0, 0, 0, 0, 0, 0.0,
7376 erasure_code_profile,
7377 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7378 cct->_conf.get_val<bool>("osd_pool_default_crimson"),
7379 &ss);
7380
7381 if (ret < 0) {
7382 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7383 }
7384 return ret;
7385 }
7386
7387 int OSDMonitor::crush_rename_bucket(const string& srcname,
7388 const string& dstname,
7389 ostream *ss)
7390 {
7391 int ret;
7392 //
7393 // Avoid creating a pending crush if it does not already exists and
7394 // the rename would fail.
7395 //
7396 if (!_have_pending_crush()) {
7397 ret = _get_stable_crush().can_rename_bucket(srcname,
7398 dstname,
7399 ss);
7400 if (ret)
7401 return ret;
7402 }
7403
7404 CrushWrapper newcrush = _get_pending_crush();
7405
7406 ret = newcrush.rename_bucket(srcname,
7407 dstname,
7408 ss);
7409 if (ret)
7410 return ret;
7411
7412 pending_inc.crush.clear();
7413 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7414 *ss << "renamed bucket " << srcname << " into " << dstname;
7415 return 0;
7416 }
7417
7418 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7419 {
7420 string replacement = "";
7421
7422 if (plugin == "jerasure_generic" ||
7423 plugin == "jerasure_sse3" ||
7424 plugin == "jerasure_sse4" ||
7425 plugin == "jerasure_neon") {
7426 replacement = "jerasure";
7427 } else if (plugin == "shec_generic" ||
7428 plugin == "shec_sse3" ||
7429 plugin == "shec_sse4" ||
7430 plugin == "shec_neon") {
7431 replacement = "shec";
7432 }
7433
7434 if (replacement != "") {
7435 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7436 << plugin << " that has been deprecated. Please use "
7437 << replacement << " instead." << dendl;
7438 }
7439 }
7440
7441 int OSDMonitor::normalize_profile(const string& profilename,
7442 ErasureCodeProfile &profile,
7443 bool force,
7444 ostream *ss)
7445 {
7446 ErasureCodeInterfaceRef erasure_code;
7447 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7448 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7449 check_legacy_ec_plugin(plugin->second, profilename);
7450 int err = instance.factory(plugin->second,
7451 g_conf().get_val<std::string>("erasure_code_dir"),
7452 profile, &erasure_code, ss);
7453 if (err) {
7454 return err;
7455 }
7456
7457 err = erasure_code->init(profile, ss);
7458 if (err) {
7459 return err;
7460 }
7461
7462 auto it = profile.find("stripe_unit");
7463 if (it != profile.end()) {
7464 string err_str;
7465 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7466 if (!err_str.empty()) {
7467 *ss << "could not parse stripe_unit '" << it->second
7468 << "': " << err_str << std::endl;
7469 return -EINVAL;
7470 }
7471 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7472 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7473 if (chunk_size != stripe_unit) {
7474 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7475 << "alignment. Would be padded to " << chunk_size
7476 << std::endl;
7477 return -EINVAL;
7478 }
7479 if ((stripe_unit % 4096) != 0 && !force) {
7480 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7481 << "use --force to override this check" << std::endl;
7482 return -EINVAL;
7483 }
7484 }
7485 return 0;
7486 }
7487
7488 int OSDMonitor::crush_rule_create_erasure(const string &name,
7489 const string &profile,
7490 int *rule,
7491 ostream *ss)
7492 {
7493 int ruleid = osdmap.crush->get_rule_id(name);
7494 if (ruleid != -ENOENT) {
7495 *rule = ruleid;
7496 return -EEXIST;
7497 }
7498
7499 CrushWrapper newcrush = _get_pending_crush();
7500
7501 ruleid = newcrush.get_rule_id(name);
7502 if (ruleid != -ENOENT) {
7503 *rule = ruleid;
7504 return -EALREADY;
7505 } else {
7506 ErasureCodeInterfaceRef erasure_code;
7507 int err = get_erasure_code(profile, &erasure_code, ss);
7508 if (err) {
7509 *ss << "failed to load plugin using profile " << profile << std::endl;
7510 return err;
7511 }
7512
7513 err = erasure_code->create_rule(name, newcrush, ss);
7514 erasure_code.reset();
7515 if (err < 0)
7516 return err;
7517 *rule = err;
7518 pending_inc.crush.clear();
7519 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7520 return 0;
7521 }
7522 }
7523
7524 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7525 ErasureCodeInterfaceRef *erasure_code,
7526 ostream *ss) const
7527 {
7528 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7529 return -EAGAIN;
7530 ErasureCodeProfile profile =
7531 osdmap.get_erasure_code_profile(erasure_code_profile);
7532 ErasureCodeProfile::const_iterator plugin =
7533 profile.find("plugin");
7534 if (plugin == profile.end()) {
7535 *ss << "cannot determine the erasure code plugin"
7536 << " because there is no 'plugin' entry in the erasure_code_profile "
7537 << profile << std::endl;
7538 return -EINVAL;
7539 }
7540 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7541 auto& instance = ErasureCodePluginRegistry::instance();
7542 return instance.factory(plugin->second,
7543 g_conf().get_val<std::string>("erasure_code_dir"),
7544 profile, erasure_code, ss);
7545 }
7546
7547 int OSDMonitor::check_cluster_features(uint64_t features,
7548 stringstream &ss)
7549 {
7550 stringstream unsupported_ss;
7551 int unsupported_count = 0;
7552 if ((mon.get_quorum_con_features() & features) != features) {
7553 unsupported_ss << "the monitor cluster";
7554 ++unsupported_count;
7555 }
7556
7557 set<int32_t> up_osds;
7558 osdmap.get_up_osds(up_osds);
7559 for (set<int32_t>::iterator it = up_osds.begin();
7560 it != up_osds.end(); ++it) {
7561 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7562 if ((xi.features & features) != features) {
7563 if (unsupported_count > 0)
7564 unsupported_ss << ", ";
7565 unsupported_ss << "osd." << *it;
7566 unsupported_count ++;
7567 }
7568 }
7569
7570 if (unsupported_count > 0) {
7571 ss << "features " << features << " unsupported by: "
7572 << unsupported_ss.str();
7573 return -ENOTSUP;
7574 }
7575
7576 // check pending osd state, too!
7577 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7578 pending_inc.new_xinfo.begin();
7579 p != pending_inc.new_xinfo.end(); ++p) {
7580 const osd_xinfo_t &xi = p->second;
7581 if ((xi.features & features) != features) {
7582 dout(10) << __func__ << " pending osd." << p->first
7583 << " features are insufficient; retry" << dendl;
7584 return -EAGAIN;
7585 }
7586 }
7587
7588 return 0;
7589 }
7590
// Check whether installing 'newcrush' would require features beyond what
// current clients and cluster daemons support.
//
// Builds a scratch OSDMap (deep copy of osdmap + pending_inc with the new
// crush encoded in) and validates it; returns true if acceptable, false
// with an explanation in 'ss'.
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // Apply the candidate crush on top of a throwaway copy of the pending
  // incremental so the real pending state is untouched.
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat: the new map must not demand a newer client than
  // require_min_compat_client promises to support.
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat: mons and up OSDs must support the features the new map
  // requires (see check_cluster_features).
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7624
7625 bool OSDMonitor::erasure_code_profile_in_use(
7626 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7627 const string &profile,
7628 ostream *ss)
7629 {
7630 bool found = false;
7631 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7632 p != pools.end();
7633 ++p) {
7634 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7635 *ss << osdmap.pool_name[p->first] << " ";
7636 found = true;
7637 }
7638 }
7639 if (found) {
7640 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7641 }
7642 return found;
7643 }
7644
7645 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7646 map<string,string> *erasure_code_profile_map,
7647 ostream *ss)
7648 {
7649 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7650 get_json_str_map,
7651 *ss,
7652 erasure_code_profile_map,
7653 true);
7654 if (r)
7655 return r;
7656 ceph_assert((*erasure_code_profile_map).count("plugin"));
7657 string default_plugin = (*erasure_code_profile_map)["plugin"];
7658 map<string,string> user_map;
7659 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7660 i != erasure_code_profile.end();
7661 ++i) {
7662 size_t equal = i->find('=');
7663 if (equal == string::npos) {
7664 user_map[*i] = string();
7665 (*erasure_code_profile_map)[*i] = string();
7666 } else {
7667 const string key = i->substr(0, equal);
7668 equal++;
7669 const string value = i->substr(equal);
7670 if (key.find("ruleset-") == 0) {
7671 *ss << "property '" << key << "' is no longer supported; try "
7672 << "'crush-" << key.substr(8) << "' instead";
7673 return -EINVAL;
7674 }
7675 user_map[key] = value;
7676 (*erasure_code_profile_map)[key] = value;
7677 }
7678 }
7679
7680 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7681 (*erasure_code_profile_map) = user_map;
7682
7683 return 0;
7684 }
7685
// Compute 'size' (replica or chunk count) and 'min_size' for a new pool.
//
// @param pool_type pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile profile used to load the EC plugin (EC only)
// @param repl_size requested replica count; 0 means "use defaults"
// @param size [out] resulting pool size
// @param min_size [out] resulting min_size
// @param ss error output
// @return 0 on success, -EINVAL for bad type / stretch-mode violations,
//         or an erasure-code plugin load error
int OSDMonitor::prepare_pool_size(const unsigned pool_type,
				  const string &erasure_code_profile,
				  uint8_t repl_size,
				  unsigned *size, unsigned *min_size,
				  ostream *ss)
{
  int err = 0;
  bool set_min_size = false;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    if (osdmap.stretch_mode_enabled) {
      // In stretch mode the pool size is dictated by configuration; an
      // explicit size that disagrees is an error.
      if (repl_size == 0)
	repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
      if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
	*ss << "prepare_pool_size: we are in stretch mode but size "
	    << repl_size << " does not match!";
	return -EINVAL;
      }
      *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
      set_min_size = true;
    }
    if (repl_size == 0) {
      repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
    }
    *size = repl_size;
    // Stretch mode already fixed min_size above; otherwise derive it
    // from the replica count.
    if (!set_min_size)
      *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      if (osdmap.stretch_mode_enabled) {
	*ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
	return -EINVAL;
      }
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err == 0) {
	// size = k+m; min_size = k plus one coding chunk of slack when
	// the profile has more than one coding chunk.
	*size = erasure_code->get_chunk_count();
	*min_size =
	  erasure_code->get_data_chunk_count() +
	  std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
	assert(*min_size <= *size);
	assert(*min_size >= erasure_code->get_data_chunk_count());
      }
    }
    break;
  default:
    *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7739
// Compute the stripe width for a new pool.
//
// Replicated pools have no stripe width (left untouched).  For erasure
// pools it is data_chunks * chunk_size, where the stripe unit comes from
// the profile's "stripe_unit" (falling back to the
// osd_pool_erasure_code_stripe_unit config option).
//
// @return 0 on success, -EINVAL for an unknown pool type, or an
//         erasure-code plugin load error
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      // A profile-level stripe_unit overrides the config default; it was
      // already validated by normalize_profile(), hence the assert.
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7778
7779 int OSDMonitor::get_replicated_stretch_crush_rule()
7780 {
7781 /* we don't write down the stretch rule anywhere, so
7782 * we have to guess it. How? Look at all the pools
7783 * and count up how many times a given rule is used
7784 * on stretch pools and then return the one with
7785 * the most users!
7786 */
7787 map<int,int> rule_counts;
7788 for (const auto& pooli : osdmap.pools) {
7789 const pg_pool_t& p = pooli.second;
7790 if (p.is_replicated() && p.is_stretch_pool()) {
7791 if (!rule_counts.count(p.crush_rule)) {
7792 rule_counts[p.crush_rule] = 1;
7793 } else {
7794 ++rule_counts[p.crush_rule];
7795 }
7796 }
7797 }
7798
7799 if (rule_counts.empty()) {
7800 return -ENOENT;
7801 }
7802
7803 int most_used_count = 0;
7804 int most_used_rule = -1;
7805 for (auto i : rule_counts) {
7806 if (i.second > most_used_count) {
7807 most_used_rule = i.first;
7808 most_used_count = i.second;
7809 }
7810 }
7811 ceph_assert(most_used_count > 0);
7812 ceph_assert(most_used_rule >= 0);
7813 return most_used_rule;
7814 }
7815
// Resolve (or create) the CRUSH rule for a new pool.
//
// If *crush_rule >= 0 on entry it is only validated for existence.
// Otherwise: for replicated pools, look up rule_name (or pick the
// default / stretch-mode rule when rule_name is empty); for erasure
// pools, create a rule from the erasure code profile if one does not
// already exist.
//
// @return 0 on success, -EAGAIN when a newly created or pending rule
//         still needs to be proposed/committed (caller retries),
//         -ENOENT when no suitable rule exists, -EINVAL for an unknown
//         pool type.
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // No stretch rule is recorded anywhere; infer the most-used
	    // one (see get_replicated_stretch_crush_rule).
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// Remap the create result: a freshly created (0) or pending
	// (-EALREADY) rule must be proposed first, so report -EAGAIN;
	// an existing committed rule (-EEXIST) is success.
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // An explicit rule id was supplied; just verify it exists.
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7880
7881 int OSDMonitor::get_crush_rule(const string &rule_name,
7882 int *crush_rule,
7883 ostream *ss)
7884 {
7885 int ret;
7886 ret = osdmap.crush->get_rule_id(rule_name);
7887 if (ret != -ENOENT) {
7888 // found it, use it
7889 *crush_rule = ret;
7890 } else {
7891 CrushWrapper newcrush = _get_pending_crush();
7892
7893 ret = newcrush.get_rule_id(rule_name);
7894 if (ret != -ENOENT) {
7895 // found it, wait for it to be proposed
7896 dout(20) << __func__ << ": rule " << rule_name
7897 << " try again" << dendl;
7898 return -EAGAIN;
7899 } else {
7900 // Cannot find it , return error
7901 *ss << "specified rule " << rule_name << " doesn't exist";
7902 return ret;
7903 }
7904 }
7905 return 0;
7906 }
7907
7908 /*
7909 * Get the number of 'in' osds according to the crush_rule,
7910 */
7911 uint32_t OSDMonitor::get_osd_num_by_crush(int crush_rule)
7912 {
7913 set<int> out_osds;
7914 set<int> crush_in_osds;
7915 set<int> roots;
7916 CrushWrapper newcrush = _get_pending_crush();
7917 newcrush.find_takes_by_rule(crush_rule, &roots);
7918 for (auto root : roots) {
7919 const char *rootname = newcrush.get_item_name(root);
7920 set<int> crush_all_osds;
7921 newcrush.get_leaves(rootname, &crush_all_osds);
7922 std::set_difference(crush_all_osds.begin(), crush_all_osds.end(),
7923 out_osds.begin(), out_osds.end(),
7924 std::inserter(crush_in_osds, crush_in_osds.end()));
7925 }
7926 return crush_in_osds.size();
7927 }
7928
7929 int OSDMonitor::check_pg_num(int64_t pool,
7930 int pg_num,
7931 int size,
7932 int crush_rule,
7933 ostream *ss)
7934 {
7935 auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
7936 uint64_t projected = 0;
7937 uint32_t osd_num_by_crush = 0;
7938 set<int64_t> crush_pool_ids;
7939 if (pool < 0) {
7940 // a new pool
7941 projected += pg_num * size;
7942 }
7943
7944 osd_num_by_crush = get_osd_num_by_crush(crush_rule);
7945 osdmap.get_pool_ids_by_rule(crush_rule, &crush_pool_ids);
7946
7947 for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
7948 // Check only for pools affected by crush rule
7949 if (crush_pool_ids.contains(pool_id)) {
7950 if (pool_id == pool) {
7951 // Specified pool, use given pg_num and size values.
7952 projected += pg_num * size;
7953 } else {
7954 // Use pg_num_target for evaluating the projected pg num
7955 projected += pool_info.get_pg_num_target() * pool_info.get_size();
7956 }
7957 }
7958 }
7959 // assume min cluster size 3
7960 osd_num_by_crush = std::max(osd_num_by_crush, 3u);
7961 auto projected_pgs_per_osd = projected / osd_num_by_crush;
7962
7963 if (projected_pgs_per_osd > max_pgs_per_osd) {
7964 if (pool >= 0) {
7965 *ss << "pool id " << pool;
7966 }
7967 *ss << " pg_num " << pg_num
7968 << " size " << size
7969 << " for this pool would result in "
7970 << projected_pgs_per_osd
7971 << " cumulative PGs per OSD (" << projected
7972 << " total PG replicas on " << osd_num_by_crush
7973 << " 'in' root OSDs by crush rule) "
7974 << "which exceeds the mon_max_pg_per_osd "
7975 << "value of " << max_pgs_per_osd;
7976 return -ERANGE;
7977 }
7978 return 0;
7979 }
7980
/**
 * Stage creation of a new pool in pending_inc.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min min pg_num
 * @param pg_num_max max pg_num
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes expected pool size hint for the autoscaler (0 = unset)
 * @param target_size_ratio expected relative pool size hint (<=0 = unset)
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode autoscale mode, one of on, off, warn
 * @param bulk indicates whether pool should be a bulk pool
 * @param crimson indicates whether pool is a crimson pool
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 string pg_autoscale_mode,
				 bool bulk,
				 bool crimson,
				 ostream *ss)
{
  if (crimson && pg_autoscale_mode.empty()) {
    // default pg_autoscale_mode to off for crimson, we'll error out below if
    // the user tried to actually set pg_autoscale_mode to something other than
    // "off"
    pg_autoscale_mode = "off";
  }

  if (name.length() == 0)
    return -EINVAL;

  // --- fill in pg_num/pgp_num defaults and validate their ranges ---
  if (pg_num == 0) {
    // with the autoscaler "on" start at 1 PG and let it grow the pool;
    // otherwise fall back to the configured default pg count
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
      return mode == "on" ? 1 : pg_num;
    };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }

  if (crimson) {
    /* crimson-osd requires that the pool be replicated and that pg_num/pgp_num
     * be static. User must also have specified set-allow-crimson */
    const auto *suffix = " (--crimson specified or osd_pool_default_crimson set)";
    if (pool_type != pg_pool_t::TYPE_REPLICATED) {
      *ss << "crimson-osd only supports replicated pools" << suffix;
      return -EINVAL;
    } else if (pg_autoscale_mode != "off") {
      *ss << "crimson-osd does not support changing pg_num or pgp_num, "
	  << "pg_autoscale_mode must be set to 'off'" << suffix;
      return -EINVAL;
    } else if (!osdmap.get_allow_crimson()) {
      *ss << "set-allow-crimson must be set to create a pool with the "
	  << "crimson flag" << suffix;
      return -EINVAL;
    }
  }

  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }

  // --- resolve crush rule and size/min_size for the requested pool type ---
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Sanity-check the crush rule by running a quick mapping simulation in a
  // forked child before committing the pool to it.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(cct, g_conf()->mon_lease);
    dout(10) << __func__ << " crush test_with_fork tester created " << dendl;
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }
  // Reject the pool if it would push the cluster over mon_max_pg_per_osd.
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the effective fast_read flag (only meaningful for EC pools).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // If a pool with this name is already queued in this proposal, treat the
  // request as already satisfied (idempotent create).
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // --- allocate a new pool id and populate the pending pg_pool_t ---
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;
  if (crimson) {
    pi->set_flag(pg_pool_t::FLAG_CRIMSON);
    // crimson pools must keep pg_num static (see checks above)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  }

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // In stretch mode, peer across the configured bucket barrier; while
  // degraded only one zone is available, so halve size/min_size.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Default autoscale mode from config; the per-call pg_autoscale_mode
  // argument overrides it further below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Start with at most mon_osd_max_initial_pgs actual PGs; pg_num_target
  // records the requested value and the mgr grows pg_num toward it.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (ratios are stored in micro units, i.e. *1e6).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8271
8272 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8273 {
8274 op->mark_osdmon_event(__func__);
8275 ostringstream ss;
8276 if (pending_inc.new_flags < 0)
8277 pending_inc.new_flags = osdmap.get_flags();
8278 pending_inc.new_flags |= flag;
8279 ss << OSDMap::get_flag_string(flag) << " is set";
8280 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8281 get_last_committed() + 1));
8282 return true;
8283 }
8284
8285 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8286 {
8287 op->mark_osdmon_event(__func__);
8288 ostringstream ss;
8289 if (pending_inc.new_flags < 0)
8290 pending_inc.new_flags = osdmap.get_flags();
8291 pending_inc.new_flags &= ~flag;
8292 ss << OSDMap::get_flag_string(flag) << " is unset";
8293 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8294 get_last_committed() + 1));
8295 return true;
8296 }
8297
8298 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8299 stringstream& ss)
8300 {
8301 string poolstr;
8302 cmd_getval(cmdmap, "pool", poolstr);
8303 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8304 if (pool < 0) {
8305 ss << "unrecognized pool '" << poolstr << "'";
8306 return -ENOENT;
8307 }
8308 string var;
8309 cmd_getval(cmdmap, "var", var);
8310
8311 pg_pool_t p = *osdmap.get_pg_pool(pool);
8312 if (pending_inc.new_pools.count(pool))
8313 p = pending_inc.new_pools[pool];
8314
8315 // accept val as a json string in the normal case (current
8316 // generation monitor). parse out int or float values from the
8317 // string as needed. however, if it is not a string, try to pull
8318 // out an int, in case an older monitor with an older json schema is
8319 // forwarding a request.
8320 string val;
8321 string interr, floaterr;
8322 int64_t n = 0;
8323 double f = 0;
8324 int64_t uf = 0; // micro-f
8325 cmd_getval(cmdmap, "val", val);
8326
8327 auto si_options = {
8328 "target_max_objects"
8329 };
8330 auto iec_options = {
8331 "target_max_bytes",
8332 "target_size_bytes",
8333 "compression_max_blob_size",
8334 "compression_min_blob_size",
8335 "csum_max_block",
8336 "csum_min_block",
8337 };
8338 if (count(begin(si_options), end(si_options), var)) {
8339 n = strict_si_cast<int64_t>(val, &interr);
8340 } else if (count(begin(iec_options), end(iec_options), var)) {
8341 n = strict_iec_cast<int64_t>(val, &interr);
8342 } else {
8343 // parse string as both int and float; different fields use different types.
8344 n = strict_strtoll(val.c_str(), 10, &interr);
8345 f = strict_strtod(val.c_str(), &floaterr);
8346 uf = llrintl(f * (double)1000000.0);
8347 }
8348
8349 if (!p.is_tier() &&
8350 (var == "hit_set_type" || var == "hit_set_period" ||
8351 var == "hit_set_count" || var == "hit_set_fpp" ||
8352 var == "target_max_objects" || var == "target_max_bytes" ||
8353 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8354 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8355 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8356 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8357 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8358 return -EACCES;
8359 }
8360
8361 if (var == "size") {
8362 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8363 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8364 return -EPERM;
8365 }
8366 if (p.type == pg_pool_t::TYPE_ERASURE) {
8367 ss << "can not change the size of an erasure-coded pool";
8368 return -ENOTSUP;
8369 }
8370 if (interr.length()) {
8371 ss << "error parsing integer value '" << val << "': " << interr;
8372 return -EINVAL;
8373 }
8374 if (n <= 0 || n > 10) {
8375 ss << "pool size must be between 1 and 10";
8376 return -EINVAL;
8377 }
8378 if (n == 1) {
8379 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8380 ss << "configuring pool size as 1 is disabled by default.";
8381 return -EPERM;
8382 }
8383 bool sure = false;
8384 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8385 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8386 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8387 "pass the flag --yes-i-really-mean-it.";
8388 return -EPERM;
8389 }
8390 }
8391 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8392 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8393 return -EINVAL;
8394 }
8395 if (n > p.size) {
8396 // only when increasing pool size
8397 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8398 if (r < 0) {
8399 return r;
8400 }
8401 }
8402 p.size = n;
8403 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8404 } else if (var == "min_size") {
8405 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8406 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8407 return -EPERM;
8408 }
8409 if (interr.length()) {
8410 ss << "error parsing integer value '" << val << "': " << interr;
8411 return -EINVAL;
8412 }
8413
8414 if (p.type != pg_pool_t::TYPE_ERASURE) {
8415 if (n < 1 || n > p.size) {
8416 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8417 return -EINVAL;
8418 }
8419 } else {
8420 ErasureCodeInterfaceRef erasure_code;
8421 int k;
8422 stringstream tmp;
8423 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8424 if (err == 0) {
8425 k = erasure_code->get_data_chunk_count();
8426 } else {
8427 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8428 return err;
8429 }
8430
8431 if (n < k || n > p.size) {
8432 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8433 return -EINVAL;
8434 }
8435 }
8436 p.min_size = n;
8437 } else if (var == "pg_num_actual") {
8438 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8439 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8440 return -EPERM;
8441 }
8442 if (interr.length()) {
8443 ss << "error parsing integer value '" << val << "': " << interr;
8444 return -EINVAL;
8445 }
8446 if (n == (int)p.get_pg_num()) {
8447 return 0;
8448 }
8449 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8450 ss << "'pg_num' must be greater than 0 and less than or equal to "
8451 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8452 << " (you may adjust 'mon max pool pg num' for higher values)";
8453 return -ERANGE;
8454 }
8455 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8456 ss << "cannot adjust pg_num while initial PGs are being created";
8457 return -EBUSY;
8458 }
8459 if (n > (int)p.get_pg_num()) {
8460 if (p.get_pg_num() != p.get_pg_num_pending()) {
8461 // force pre-nautilus clients to resend their ops, since they
8462 // don't understand pg_num_pending changes form a new interval
8463 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8464 }
8465 p.set_pg_num(n);
8466 } else {
8467 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8468 ss << "nautilus OSDs are required to adjust pg_num_pending";
8469 return -EPERM;
8470 }
8471 if (n < (int)p.get_pgp_num()) {
8472 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8473 return -EINVAL;
8474 }
8475 if (n < (int)p.get_pg_num() - 1) {
8476 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8477 << ") - 1; only single pg decrease is currently supported";
8478 return -EINVAL;
8479 }
8480 p.set_pg_num_pending(n);
8481 // force pre-nautilus clients to resend their ops, since they
8482 // don't understand pg_num_pending changes form a new interval
8483 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8484 }
8485 // force pre-luminous clients to resend their ops, since they
8486 // don't understand that split PGs now form a new interval.
8487 p.last_force_op_resend_preluminous = pending_inc.epoch;
8488 } else if (var == "pg_num") {
8489 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8490 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8491 return -EPERM;
8492 }
8493 if (interr.length()) {
8494 ss << "error parsing integer value '" << val << "': " << interr;
8495 return -EINVAL;
8496 }
8497 if (n == (int)p.get_pg_num_target()) {
8498 return 0;
8499 }
8500 if (n <= 0 || static_cast<uint64_t>(n) >
8501 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8502 ss << "'pg_num' must be greater than 0 and less than or equal to "
8503 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8504 << " (you may adjust 'mon max pool pg num' for higher values)";
8505 return -ERANGE;
8506 }
8507 if (n > (int)p.get_pg_num_target()) {
8508 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8509 if (r) {
8510 return r;
8511 }
8512 bool force = false;
8513 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8514 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8515 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8516 return -EPERM;
8517 }
8518 } else {
8519 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8520 ss << "nautilus OSDs are required to decrease pg_num";
8521 return -EPERM;
8522 }
8523 }
8524 int64_t pg_min = 0, pg_max = 0;
8525 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8526 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8527 if (pg_min && n < pg_min) {
8528 ss << "specified pg_num " << n
8529 << " < pg_num_min " << pg_min;
8530 return -EINVAL;
8531 }
8532 if (pg_max && n > pg_max) {
8533 ss << "specified pg_num " << n
8534 << " < pg_num_max " << pg_max;
8535 return -EINVAL;
8536 }
8537 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8538 // pre-nautilus osdmap format; increase pg_num directly
8539 assert(n > (int)p.get_pg_num());
8540 // force pre-nautilus clients to resend their ops, since they
8541 // don't understand pg_num_target changes form a new interval
8542 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8543 // force pre-luminous clients to resend their ops, since they
8544 // don't understand that split PGs now form a new interval.
8545 p.last_force_op_resend_preluminous = pending_inc.epoch;
8546 p.set_pg_num(n);
8547 } else {
8548 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8549 // make pgp_num track pg_num if it already matches. if it is set
8550 // differently, leave it different and let the user control it
8551 // manually.
8552 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8553 p.set_pgp_num_target(n);
8554 }
8555 p.set_pg_num_target(n);
8556 }
8557 } else if (var == "pgp_num_actual") {
8558 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8559 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8560 return -EPERM;
8561 }
8562 if (interr.length()) {
8563 ss << "error parsing integer value '" << val << "': " << interr;
8564 return -EINVAL;
8565 }
8566 if (n <= 0) {
8567 ss << "specified pgp_num must > 0, but you set to " << n;
8568 return -EINVAL;
8569 }
8570 if (n > (int)p.get_pg_num()) {
8571 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8572 return -EINVAL;
8573 }
8574 if (n > (int)p.get_pg_num_pending()) {
8575 ss << "specified pgp_num " << n
8576 << " > pg_num_pending " << p.get_pg_num_pending();
8577 return -EINVAL;
8578 }
8579 p.set_pgp_num(n);
8580 } else if (var == "pgp_num") {
8581 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8582 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8583 return -EPERM;
8584 }
8585 if (interr.length()) {
8586 ss << "error parsing integer value '" << val << "': " << interr;
8587 return -EINVAL;
8588 }
8589 if (n <= 0) {
8590 ss << "specified pgp_num must > 0, but you set to " << n;
8591 return -EINVAL;
8592 }
8593 if (n > (int)p.get_pg_num_target()) {
8594 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8595 return -EINVAL;
8596 }
8597 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8598 // pre-nautilus osdmap format; increase pgp_num directly
8599 p.set_pgp_num(n);
8600 } else {
8601 p.set_pgp_num_target(n);
8602 }
8603 } else if (var == "pg_autoscale_mode") {
8604 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8605 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8606 ss << "specified invalid mode " << val;
8607 return -EINVAL;
8608 }
8609 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8610 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8611 return -EINVAL;
8612 }
8613 p.pg_autoscale_mode = m;
8614 } else if (var == "crush_rule") {
8615 int id = osdmap.crush->get_rule_id(val);
8616 if (id == -ENOENT) {
8617 ss << "crush rule " << val << " does not exist";
8618 return -ENOENT;
8619 }
8620 if (id < 0) {
8621 ss << cpp_strerror(id);
8622 return -ENOENT;
8623 }
8624 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8625 ss << "crush rule " << id << " type does not match pool";
8626 return -EINVAL;
8627 }
8628 p.crush_rule = id;
8629 } else if (var == "nodelete" || var == "nopgchange" ||
8630 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8631 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8632 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8633 // make sure we only compare against 'n' if we didn't receive a string
8634 if (val == "true" || (interr.empty() && n == 1)) {
8635 p.set_flag(flag);
8636 } else if (val == "false" || (interr.empty() && n == 0)) {
8637 if (flag == pg_pool_t::FLAG_NOPGCHANGE && p.is_crimson()) {
8638 ss << "cannot clear FLAG_NOPGCHANGE on a crimson pool";
8639 return -EINVAL;
8640 }
8641 p.unset_flag(flag);
8642 } else {
8643 ss << "expecting value 'true', 'false', '0', or '1'";
8644 return -EINVAL;
8645 }
8646 } else if (var == "eio") {
8647 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8648
8649 // make sure we only compare against 'n' if we didn't receive a string
8650 if (val == "true" || (interr.empty() && n == 1)) {
8651 p.set_flag(flag);
8652 } else if (val == "false" || (interr.empty() && n == 0)) {
8653 p.unset_flag(flag);
8654 } else {
8655 ss << "expecting value 'true', 'false', '0', or '1'";
8656 return -EINVAL;
8657 }
8658 } else if (var == "hashpspool") {
8659 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8660 bool force = false;
8661 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8662
8663 if (!force) {
8664 ss << "are you SURE? this will remap all placement groups in this pool,"
8665 " this triggers large data movement,"
8666 " pass --yes-i-really-mean-it if you really do.";
8667 return -EPERM;
8668 }
8669 // make sure we only compare against 'n' if we didn't receive a string
8670 if (val == "true" || (interr.empty() && n == 1)) {
8671 p.set_flag(flag);
8672 } else if (val == "false" || (interr.empty() && n == 0)) {
8673 p.unset_flag(flag);
8674 } else {
8675 ss << "expecting value 'true', 'false', '0', or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "hit_set_type") {
8679 if (val == "none")
8680 p.hit_set_params = HitSet::Params();
8681 else {
8682 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8683 if (err)
8684 return err;
8685 if (val == "bloom") {
8686 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8687 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8688 p.hit_set_params = HitSet::Params(bsp);
8689 } else if (val == "explicit_hash")
8690 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8691 else if (val == "explicit_object")
8692 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8693 else {
8694 ss << "unrecognized hit_set type '" << val << "'";
8695 return -EINVAL;
8696 }
8697 }
8698 } else if (var == "hit_set_period") {
8699 if (interr.length()) {
8700 ss << "error parsing integer value '" << val << "': " << interr;
8701 return -EINVAL;
8702 } else if (n < 0) {
8703 ss << "hit_set_period should be non-negative";
8704 return -EINVAL;
8705 }
8706 p.hit_set_period = n;
8707 } else if (var == "hit_set_count") {
8708 if (interr.length()) {
8709 ss << "error parsing integer value '" << val << "': " << interr;
8710 return -EINVAL;
8711 } else if (n < 0) {
8712 ss << "hit_set_count should be non-negative";
8713 return -EINVAL;
8714 }
8715 p.hit_set_count = n;
8716 } else if (var == "hit_set_fpp") {
8717 if (floaterr.length()) {
8718 ss << "error parsing floating point value '" << val << "': " << floaterr;
8719 return -EINVAL;
8720 } else if (f < 0 || f > 1.0) {
8721 ss << "hit_set_fpp should be in the range 0..1";
8722 return -EINVAL;
8723 }
8724 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8725 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8726 return -EINVAL;
8727 }
8728 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8729 bloomp->set_fpp(f);
8730 } else if (var == "use_gmt_hitset") {
8731 if (val == "true" || (interr.empty() && n == 1)) {
8732 p.use_gmt_hitset = true;
8733 } else {
8734 ss << "expecting value 'true' or '1'";
8735 return -EINVAL;
8736 }
8737 } else if (var == "allow_ec_overwrites") {
8738 if (!p.is_erasure()) {
8739 ss << "ec overwrites can only be enabled for an erasure coded pool";
8740 return -EINVAL;
8741 }
8742 stringstream err;
8743 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8744 !is_pool_currently_all_bluestore(pool, p, &err)) {
8745 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8746 return -EINVAL;
8747 }
8748 if (val == "true" || (interr.empty() && n == 1)) {
8749 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8750 } else if (val == "false" || (interr.empty() && n == 0)) {
8751 ss << "ec overwrites cannot be disabled once enabled";
8752 return -EINVAL;
8753 } else {
8754 ss << "expecting value 'true', 'false', '0', or '1'";
8755 return -EINVAL;
8756 }
8757 } else if (var == "target_max_objects") {
8758 if (interr.length()) {
8759 ss << "error parsing int '" << val << "': " << interr;
8760 return -EINVAL;
8761 }
8762 p.target_max_objects = n;
8763 } else if (var == "target_max_bytes") {
8764 if (interr.length()) {
8765 ss << "error parsing int '" << val << "': " << interr;
8766 return -EINVAL;
8767 }
8768 p.target_max_bytes = n;
8769 } else if (var == "cache_target_dirty_ratio") {
8770 if (floaterr.length()) {
8771 ss << "error parsing float '" << val << "': " << floaterr;
8772 return -EINVAL;
8773 }
8774 if (f < 0 || f > 1.0) {
8775 ss << "value must be in the range 0..1";
8776 return -ERANGE;
8777 }
8778 p.cache_target_dirty_ratio_micro = uf;
8779 } else if (var == "cache_target_dirty_high_ratio") {
8780 if (floaterr.length()) {
8781 ss << "error parsing float '" << val << "': " << floaterr;
8782 return -EINVAL;
8783 }
8784 if (f < 0 || f > 1.0) {
8785 ss << "value must be in the range 0..1";
8786 return -ERANGE;
8787 }
8788 p.cache_target_dirty_high_ratio_micro = uf;
8789 } else if (var == "cache_target_full_ratio") {
8790 if (floaterr.length()) {
8791 ss << "error parsing float '" << val << "': " << floaterr;
8792 return -EINVAL;
8793 }
8794 if (f < 0 || f > 1.0) {
8795 ss << "value must be in the range 0..1";
8796 return -ERANGE;
8797 }
8798 p.cache_target_full_ratio_micro = uf;
8799 } else if (var == "cache_min_flush_age") {
8800 if (interr.length()) {
8801 ss << "error parsing int '" << val << "': " << interr;
8802 return -EINVAL;
8803 }
8804 p.cache_min_flush_age = n;
8805 } else if (var == "cache_min_evict_age") {
8806 if (interr.length()) {
8807 ss << "error parsing int '" << val << "': " << interr;
8808 return -EINVAL;
8809 }
8810 p.cache_min_evict_age = n;
8811 } else if (var == "min_read_recency_for_promote") {
8812 if (interr.length()) {
8813 ss << "error parsing integer value '" << val << "': " << interr;
8814 return -EINVAL;
8815 }
8816 p.min_read_recency_for_promote = n;
8817 } else if (var == "hit_set_grade_decay_rate") {
8818 if (interr.length()) {
8819 ss << "error parsing integer value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > 100 || n < 0) {
8823 ss << "value out of range,valid range is 0 - 100";
8824 return -EINVAL;
8825 }
8826 p.hit_set_grade_decay_rate = n;
8827 } else if (var == "hit_set_search_last_n") {
8828 if (interr.length()) {
8829 ss << "error parsing integer value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n > p.hit_set_count || n < 0) {
8833 ss << "value out of range,valid range is 0 - hit_set_count";
8834 return -EINVAL;
8835 }
8836 p.hit_set_search_last_n = n;
8837 } else if (var == "min_write_recency_for_promote") {
8838 if (interr.length()) {
8839 ss << "error parsing integer value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 p.min_write_recency_for_promote = n;
8843 } else if (var == "fast_read") {
8844 if (p.is_replicated()) {
8845 ss << "fast read is not supported in replication pool";
8846 return -EINVAL;
8847 }
8848 if (val == "true" || (interr.empty() && n == 1)) {
8849 p.fast_read = true;
8850 } else if (val == "false" || (interr.empty() && n == 0)) {
8851 p.fast_read = false;
8852 } else {
8853 ss << "expecting value 'true', 'false', '0', or '1'";
8854 return -EINVAL;
8855 }
8856 } else if (pool_opts_t::is_opt_name(var)) {
8857 bool unset = val == "unset";
8858 if (var == "compression_mode") {
8859 if (!unset) {
8860 auto cmode = Compressor::get_comp_mode_type(val);
8861 if (!cmode) {
8862 ss << "unrecognized compression mode '" << val << "'";
8863 return -EINVAL;
8864 }
8865 }
8866 } else if (var == "compression_algorithm") {
8867 if (!unset) {
8868 auto alg = Compressor::get_comp_alg_type(val);
8869 if (!alg) {
8870 ss << "unrecognized compression_algorithm '" << val << "'";
8871 return -EINVAL;
8872 }
8873 }
8874 } else if (var == "compression_required_ratio") {
8875 if (floaterr.length()) {
8876 ss << "error parsing float value '" << val << "': " << floaterr;
8877 return -EINVAL;
8878 }
8879 if (f < 0 || f > 1) {
8880 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8881 return -EINVAL;
8882 }
8883 } else if (var == "csum_type") {
8884 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8885 if (t < 0 ) {
8886 ss << "unrecognized csum_type '" << val << "'";
8887 return -EINVAL;
8888 }
8889 //preserve csum_type numeric value
8890 n = t;
8891 interr.clear();
8892 } else if (var == "compression_max_blob_size" ||
8893 var == "compression_min_blob_size" ||
8894 var == "csum_max_block" ||
8895 var == "csum_min_block") {
8896 if (interr.length()) {
8897 ss << "error parsing int value '" << val << "': " << interr;
8898 return -EINVAL;
8899 }
8900 } else if (var == "fingerprint_algorithm") {
8901 if (!unset) {
8902 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8903 if (!alg) {
8904 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8905 return -EINVAL;
8906 }
8907 }
8908 } else if (var == "target_size_bytes") {
8909 if (interr.length()) {
8910 ss << "error parsing unit value '" << val << "': " << interr;
8911 return -EINVAL;
8912 }
8913 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8914 ss << "must set require_osd_release to nautilus or "
8915 << "later before setting target_size_bytes";
8916 return -EINVAL;
8917 }
8918 } else if (var == "target_size_ratio") {
8919 if (f < 0.0) {
8920 ss << "target_size_ratio cannot be negative";
8921 return -EINVAL;
8922 }
8923 } else if (var == "pg_num_min") {
8924 if (interr.length()) {
8925 ss << "error parsing int value '" << val << "': " << interr;
8926 return -EINVAL;
8927 }
8928 if (n > (int)p.get_pg_num_target()) {
8929 ss << "specified pg_num_min " << n
8930 << " > pg_num " << p.get_pg_num_target();
8931 return -EINVAL;
8932 }
8933 } else if (var == "pg_num_max") {
8934 if (interr.length()) {
8935 ss << "error parsing int value '" << val << "': " << interr;
8936 return -EINVAL;
8937 }
8938 if (n && n < (int)p.get_pg_num_target()) {
8939 ss << "specified pg_num_max " << n
8940 << " < pg_num " << p.get_pg_num_target();
8941 return -EINVAL;
8942 }
8943 } else if (var == "recovery_priority") {
8944 if (interr.length()) {
8945 ss << "error parsing int value '" << val << "': " << interr;
8946 return -EINVAL;
8947 }
8948 if (!g_conf()->debug_allow_any_pool_priority) {
8949 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8950 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8951 << " and " << OSD_POOL_PRIORITY_MAX;
8952 return -EINVAL;
8953 }
8954 }
8955 } else if (var == "pg_autoscale_bias") {
8956 if (f < 0.0 || f > 1000.0) {
8957 ss << "pg_autoscale_bias must be between 0 and 1000";
8958 return -EINVAL;
8959 }
8960 } else if (var == "dedup_tier") {
8961 if (interr.empty()) {
8962 ss << "expecting value 'pool name'";
8963 return -EINVAL;
8964 }
8965 // Current base tier in dedup does not support ec pool
8966 if (p.is_erasure()) {
8967 ss << "pool '" << poolstr
8968 << "' is an ec pool, which cannot be a base tier";
8969 return -ENOTSUP;
8970 }
8971 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8972 if (lowtierpool_id < 0) {
8973 ss << "unrecognized pool '" << val << "'";
8974 return -ENOENT;
8975 }
8976 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8977 ceph_assert(tp);
8978 n = lowtierpool_id;
8979 // The original input is string (pool name), but we convert it to int64_t.
8980 // So, clear interr
8981 interr.clear();
8982 } else if (var == "dedup_chunk_algorithm") {
8983 if (!unset) {
8984 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8985 if (!alg) {
8986 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8987 return -EINVAL;
8988 }
8989 }
8990 } else if (var == "dedup_cdc_chunk_size") {
8991 if (interr.length()) {
8992 ss << "error parsing int value '" << val << "': " << interr;
8993 return -EINVAL;
8994 }
8995 }
8996
8997 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8998 switch (desc.type) {
8999 case pool_opts_t::STR:
9000 if (unset) {
9001 p.opts.unset(desc.key);
9002 } else {
9003 p.opts.set(desc.key, static_cast<std::string>(val));
9004 }
9005 break;
9006 case pool_opts_t::INT:
9007 if (interr.length()) {
9008 ss << "error parsing integer value '" << val << "': " << interr;
9009 return -EINVAL;
9010 }
9011 if (n == 0) {
9012 p.opts.unset(desc.key);
9013 } else {
9014 p.opts.set(desc.key, static_cast<int64_t>(n));
9015 }
9016 break;
9017 case pool_opts_t::DOUBLE:
9018 if (floaterr.length()) {
9019 ss << "error parsing floating point value '" << val << "': " << floaterr;
9020 return -EINVAL;
9021 }
9022 if (f == 0) {
9023 p.opts.unset(desc.key);
9024 } else {
9025 p.opts.set(desc.key, static_cast<double>(f));
9026 }
9027 break;
9028 default:
9029 ceph_assert(!"unknown type");
9030 }
9031 } else {
9032 ss << "unrecognized variable '" << var << "'";
9033 return -EINVAL;
9034 }
9035 if (val != "unset") {
9036 ss << "set pool " << pool << " " << var << " to " << val;
9037 } else {
9038 ss << "unset pool " << pool << " " << var;
9039 }
9040 p.last_change = pending_inc.epoch;
9041 pending_inc.new_pools[pool] = p;
9042 return 0;
9043 }
9044
9045 int OSDMonitor::prepare_command_pool_application(const string &prefix,
9046 const cmdmap_t& cmdmap,
9047 stringstream& ss)
9048 {
9049 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
9050 }
9051
9052 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
9053 const cmdmap_t& cmdmap,
9054 stringstream& ss,
9055 bool *modified)
9056 {
9057 return _command_pool_application(prefix, cmdmap, ss, modified, false);
9058 }
9059
9060
9061 /**
9062 * Common logic for preprocess and prepare phases of pool application
9063 * tag commands. In preprocess mode we're only detecting invalid
9064 * commands, and determining whether it was a modification or a no-op.
9065 * In prepare mode we're actually updating the pending state.
9066 */
9067 int OSDMonitor::_command_pool_application(const string &prefix,
9068 const cmdmap_t& cmdmap,
9069 stringstream& ss,
9070 bool *modified,
9071 bool preparing)
9072 {
9073 string pool_name;
9074 cmd_getval(cmdmap, "pool", pool_name);
9075 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9076 if (pool < 0) {
9077 ss << "unrecognized pool '" << pool_name << "'";
9078 return -ENOENT;
9079 }
9080
9081 pg_pool_t p = *osdmap.get_pg_pool(pool);
9082 if (preparing) {
9083 if (pending_inc.new_pools.count(pool)) {
9084 p = pending_inc.new_pools[pool];
9085 }
9086 }
9087
9088 string app;
9089 cmd_getval(cmdmap, "app", app);
9090 bool app_exists = (p.application_metadata.count(app) > 0);
9091
9092 string key;
9093 cmd_getval(cmdmap, "key", key);
9094 if (key == "all") {
9095 ss << "key cannot be 'all'";
9096 return -EINVAL;
9097 }
9098
9099 string value;
9100 cmd_getval(cmdmap, "value", value);
9101 if (value == "all") {
9102 ss << "value cannot be 'all'";
9103 return -EINVAL;
9104 }
9105
9106 if (boost::algorithm::ends_with(prefix, "enable")) {
9107 if (app.empty()) {
9108 ss << "application name must be provided";
9109 return -EINVAL;
9110 }
9111
9112 if (p.is_tier()) {
9113 ss << "application must be enabled on base tier";
9114 return -EINVAL;
9115 }
9116
9117 bool force = false;
9118 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9119
9120 if (!app_exists && !p.application_metadata.empty() && !force) {
9121 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9122 << "application; pass --yes-i-really-mean-it to proceed anyway";
9123 return -EPERM;
9124 }
9125
9126 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9127 ss << "too many enabled applications on pool '" << pool_name << "'; "
9128 << "max " << MAX_POOL_APPLICATIONS;
9129 return -EINVAL;
9130 }
9131
9132 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "application name '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 if (!app_exists) {
9139 p.application_metadata[app] = {};
9140 }
9141 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9142
9143 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9144 bool force = false;
9145 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9146
9147 if (!force) {
9148 ss << "Are you SURE? Disabling an application within a pool might result "
9149 << "in loss of application functionality; pass "
9150 << "--yes-i-really-mean-it to proceed anyway";
9151 return -EPERM;
9152 }
9153
9154 if (!app_exists) {
9155 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9156 << "'";
9157 return 0; // idempotent
9158 }
9159
9160 p.application_metadata.erase(app);
9161 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9162
9163 } else if (boost::algorithm::ends_with(prefix, "set")) {
9164 if (p.is_tier()) {
9165 ss << "application metadata must be set on base tier";
9166 return -EINVAL;
9167 }
9168
9169 if (!app_exists) {
9170 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9171 << "'";
9172 return -ENOENT;
9173 }
9174
9175 string key;
9176 cmd_getval(cmdmap, "key", key);
9177
9178 if (key.empty()) {
9179 ss << "key must be provided";
9180 return -EINVAL;
9181 }
9182
9183 auto &app_keys = p.application_metadata[app];
9184 if (app_keys.count(key) == 0 &&
9185 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9186 ss << "too many keys set for application '" << app << "' on pool '"
9187 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9188 return -EINVAL;
9189 }
9190
9191 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9192 ss << "key '" << app << "' too long; max length "
9193 << MAX_POOL_APPLICATION_LENGTH;
9194 return -EINVAL;
9195 }
9196
9197 string value;
9198 cmd_getval(cmdmap, "value", value);
9199 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9200 ss << "value '" << value << "' too long; max length "
9201 << MAX_POOL_APPLICATION_LENGTH;
9202 return -EINVAL;
9203 }
9204
9205 p.application_metadata[app][key] = value;
9206 ss << "set application '" << app << "' key '" << key << "' to '"
9207 << value << "' on pool '" << pool_name << "'";
9208 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9209 if (!app_exists) {
9210 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9211 << "'";
9212 return -ENOENT;
9213 }
9214
9215 string key;
9216 cmd_getval(cmdmap, "key", key);
9217 auto it = p.application_metadata[app].find(key);
9218 if (it == p.application_metadata[app].end()) {
9219 ss << "application '" << app << "' on pool '" << pool_name
9220 << "' does not have key '" << key << "'";
9221 return 0; // idempotent
9222 }
9223
9224 p.application_metadata[app].erase(it);
9225 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9226 << pool_name << "'";
9227 } else {
9228 ceph_abort();
9229 }
9230
9231 if (preparing) {
9232 p.last_change = pending_inc.epoch;
9233 pending_inc.new_pools[pool] = p;
9234 }
9235
9236 // Because we fell through this far, we didn't hit no-op cases,
9237 // so pool was definitely modified
9238 if (modified != nullptr) {
9239 *modified = true;
9240 }
9241
9242 return 0;
9243 }
9244
9245 int OSDMonitor::_prepare_command_osd_crush_remove(
9246 CrushWrapper &newcrush,
9247 int32_t id,
9248 int32_t ancestor,
9249 bool has_ancestor,
9250 bool unlink_only)
9251 {
9252 int err = 0;
9253
9254 if (has_ancestor) {
9255 err = newcrush.remove_item_under(cct, id, ancestor,
9256 unlink_only);
9257 } else {
9258 err = newcrush.remove_item(cct, id, unlink_only);
9259 }
9260 return err;
9261 }
9262
// Stage an (already modified) crush map: re-encode it into the pending
// incremental, replacing any previously staged crush blob, so it is
// committed with the next proposal.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
9268
9269 int OSDMonitor::prepare_command_osd_crush_remove(
9270 CrushWrapper &newcrush,
9271 int32_t id,
9272 int32_t ancestor,
9273 bool has_ancestor,
9274 bool unlink_only)
9275 {
9276 int err = _prepare_command_osd_crush_remove(
9277 newcrush, id, ancestor,
9278 has_ancestor, unlink_only);
9279
9280 if (err < 0)
9281 return err;
9282
9283 ceph_assert(err == 0);
9284 do_osd_crush_remove(newcrush);
9285
9286 return 0;
9287 }
9288
9289 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9290 {
9291 if (osdmap.is_up(id)) {
9292 return -EBUSY;
9293 }
9294
9295 pending_inc.new_state[id] = osdmap.get_state(id);
9296 pending_inc.new_uuid[id] = uuid_d();
9297 pending_metadata_rm.insert(id);
9298 pending_metadata.erase(id);
9299
9300 return 0;
9301 }
9302
9303 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9304 {
9305 ceph_assert(existing_id);
9306 *existing_id = -1;
9307
9308 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9309 if (!osdmap.exists(i) &&
9310 pending_inc.new_up_client.count(i) == 0 &&
9311 (pending_inc.new_state.count(i) == 0 ||
9312 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9313 *existing_id = i;
9314 return -1;
9315 }
9316 }
9317
9318 if (pending_inc.new_max_osd < 0) {
9319 return osdmap.get_max_osd();
9320 }
9321 return pending_inc.new_max_osd;
9322 }
9323
// Create (or look up) an osd for the given id/uuid, staging the result in
// pending_inc; optionally place the osd in the given crush device class.
// Callers are expected to have run validate_osd_create() first.
// On return, *new_id holds the id actually used.
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd: reuse that id (idempotent/replay path)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reusing a free slot below max_osd
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // assign the requested crush device class to the new osd
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // make sure (pending) max_osd covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  // weight CEPH_OSD_IN for the new osd
  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9413
9414 int OSDMonitor::validate_osd_create(
9415 const int32_t id,
9416 const uuid_d& uuid,
9417 const bool check_osd_exists,
9418 int32_t* existing_id,
9419 stringstream& ss)
9420 {
9421
9422 dout(10) << __func__ << " id " << id << " uuid " << uuid
9423 << " check_osd_exists " << check_osd_exists << dendl;
9424
9425 ceph_assert(existing_id);
9426
9427 if (id < 0 && uuid.is_zero()) {
9428 // we have nothing to validate
9429 *existing_id = -1;
9430 return 0;
9431 } else if (uuid.is_zero()) {
9432 // we have an id but we will ignore it - because that's what
9433 // `osd create` does.
9434 return 0;
9435 }
9436
9437 /*
9438 * This function will be used to validate whether we are able to
9439 * create a new osd when the `uuid` is specified.
9440 *
9441 * It will be used by both `osd create` and `osd new`, as the checks
9442 * are basically the same when it pertains to osd id and uuid validation.
9443 * However, `osd create` presumes an `uuid` is optional, for legacy
9444 * reasons, while `osd new` requires the `uuid` to be provided. This
9445 * means that `osd create` will not be idempotent if an `uuid` is not
9446 * provided, but we will always guarantee the idempotency of `osd new`.
9447 */
9448
9449 ceph_assert(!uuid.is_zero());
9450 if (pending_inc.identify_osd(uuid) >= 0) {
9451 // osd is about to exist
9452 return -EAGAIN;
9453 }
9454
9455 int32_t i = osdmap.identify_osd(uuid);
9456 if (i >= 0) {
9457 // osd already exists
9458 if (id >= 0 && i != id) {
9459 ss << "uuid " << uuid << " already in use for different id " << i;
9460 return -EEXIST;
9461 }
9462 // return a positive errno to distinguish between a blocking error
9463 // and an error we consider to not be a problem (i.e., this would be
9464 // an idempotent operation).
9465 *existing_id = i;
9466 return EEXIST;
9467 }
9468 // i < 0
9469 if (id >= 0) {
9470 if (pending_inc.new_state.count(id)) {
9471 // osd is about to exist
9472 return -EAGAIN;
9473 }
9474 // we may not care if an osd exists if we are recreating a previously
9475 // destroyed osd.
9476 if (check_osd_exists && osdmap.exists(id)) {
9477 ss << "id " << id << " already in use and does not match uuid "
9478 << uuid;
9479 return -EINVAL;
9480 }
9481 }
9482 return 0;
9483 }
9484
9485 int OSDMonitor::prepare_command_osd_create(
9486 const int32_t id,
9487 const uuid_d& uuid,
9488 int32_t* existing_id,
9489 stringstream& ss)
9490 {
9491 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9492 ceph_assert(existing_id);
9493 if (osdmap.is_destroyed(id)) {
9494 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9495 "instead.";
9496 return -EINVAL;
9497 }
9498
9499 if (uuid.is_zero()) {
9500 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9501 }
9502
9503 return validate_osd_create(id, uuid, true, existing_id, ss);
9504 }
9505
/**
 * Handle `osd new`: create a new osd, or recreate a previously
 * destroyed one, staging osdmap / auth / kv changes as needed.
 *
 * @param op      the originating monitor command request
 * @param cmdmap  parsed command arguments (`uuid` required, `id` optional)
 * @param params  optional secrets/attributes supplied via `-i` json
 *                (cephx_secret, cephx_lockbox_secret, dmcrypt_key,
 *                crush_device_class)
 * @param ss      human-readable output (the osd id on success)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST when the command is an
 *         idempotent no-op, or a negative errno on failure.
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // a negative return means a free slot was found; use it
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory once any secret is supplied
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    // lockbox secret and dm-crypt key must be supplied together
    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      // the dm-crypt key lives in the kv (config-key) service
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    // NOTE(review): new_state bits appear to be applied as an xor over the
    // current state, so setting CEPH_OSD_DESTROYED here clears the flag on
    // apply -- confirm against OSDMap::Incremental before relying on this.
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9769
9770 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9771 {
9772 op->mark_osdmon_event(__func__);
9773 auto m = op->get_req<MMonCommand>();
9774 stringstream ss;
9775 cmdmap_t cmdmap;
9776 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9777 string rs = ss.str();
9778 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9779 return false; /* nothing to propose */
9780 }
9781
9782 MonSession *session = op->get_session();
9783 if (!session) {
9784 derr << __func__ << " no session" << dendl;
9785 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9786 return false; /* nothing to propose */
9787 }
9788
9789 return prepare_command_impl(op, cmdmap);
9790 }
9791
9792 static int parse_reweights(CephContext *cct,
9793 const cmdmap_t& cmdmap,
9794 const OSDMap& osdmap,
9795 map<int32_t, uint32_t>* weights)
9796 {
9797 string weights_str;
9798 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9799 return -EINVAL;
9800 }
9801 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9802 json_spirit::mValue json_value;
9803 if (!json_spirit::read(weights_str, json_value)) {
9804 return -EINVAL;
9805 }
9806 if (json_value.type() != json_spirit::obj_type) {
9807 return -EINVAL;
9808 }
9809 const auto obj = json_value.get_obj();
9810 try {
9811 for (auto& osd_weight : obj) {
9812 auto osd_id = std::stoi(osd_weight.first);
9813 if (!osdmap.exists(osd_id)) {
9814 return -ENOENT;
9815 }
9816 if (osd_weight.second.type() != json_spirit::str_type) {
9817 return -EINVAL;
9818 }
9819 auto weight = std::stoul(osd_weight.second.get_str());
9820 weights->insert({osd_id, weight});
9821 }
9822 } catch (const std::logic_error& e) {
9823 return -EINVAL;
9824 }
9825 return 0;
9826 }
9827
// Stage the destruction of osd.<id> in pending_inc: mark it DESTROYED,
// zero its uuid, and remove its cephx/lockbox auth entities and its
// config-key entries.  Returns -ENOENT if the osd does not exist (for
// the benefit of `osd purge`), 0 on success or if the osd was already
// destroyed.  Requires paxos to be plugged; the caller is responsible
// for proposing the pending map (see the comment at the bottom).
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  // idempotent_auth / idempotent_cks: set when the auth entities or
  // config-key entries are already gone, so the corresponding removal
  // step below can be skipped without treating it as an error.
  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already removed; nothing to do on the auth side
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // -ENOENT is the only failure validate_osd_destroy can report here;
    // it means the config-key entries are already gone
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  // stage the osdmap change: flag the osd DESTROYED and wipe its uuid
  // so the id can be reused by a future `osd new`
  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9899
// Stage a full purge of osd.<id>: remove it from crush, destroy it
// (auth, config-keys, DESTROYED flag) and remove it from the osdmap.
// Returns -ENOENT when the osd is already fully gone (idempotent replay),
// 0 on success, or a negative error before any update has been staged.
// Requires paxos to be plugged and the osd to be down; the caller
// proposes the pending map.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, on success this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  // tracks whether every step so far was a no-op; if so and the osd is
  // also gone from the map, this is a replay and we report -ENOENT below
  bool may_be_idempotent = false;

  // step 1: validate the crush removal up front (on a copy); the actual
  // commit of this change is deferred to do_osd_crush_remove() at the end
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // no point destroying the osd again if it has already been marked destroyed
  if (!osdmap.is_destroyed(id)) {
    // step 2: destroy auth/config-key state and stage the DESTROYED flag
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        err = 0;
      } else {
        return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: stage removal from the osdmap
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: commit the crush removal validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9967
// Parse and validate the "pgid" command argument into `pgid`.
// Returns 0 on success, -EINVAL if the argument is missing or does not
// parse as a pgid, and -ENOENT if the pg does not exist in the current
// osdmap.
// NOTE(review): `pgids` is taken *by value*, so the assignment below
// mutates only this function's local copy and can never be observed by
// the caller -- if it is meant to hand the raw pgid string back, the
// parameter should be a pointer or reference; confirm against callers.
int OSDMonitor::parse_pgid(const cmdmap_t& cmdmap, stringstream &ss,
                           /* out */ pg_t &pgid, std::optional<string> pgids) {
  string pgidstr;
  if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
    ss << "unable to parse 'pgid' value '"
       << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
    return -EINVAL;
  }
  if (!pgid.parse(pgidstr.c_str())) {
    ss << "invalid pgid '" << pgidstr << "'";
    return -EINVAL;
  }
  if (!osdmap.pg_exists(pgid)) {
    ss << "pgid '" << pgid << "' does not exist";
    return -ENOENT;
  }
  if (pgids.has_value())
    pgids.value() = pgidstr;  // see NOTE above: writes to a local copy only
  return 0;
}
9988
9989 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9990 const cmdmap_t& cmdmap)
9991 {
9992 op->mark_osdmon_event(__func__);
9993 auto m = op->get_req<MMonCommand>();
9994 stringstream ss;
9995 string rs;
9996 bufferlist rdata;
9997 int err = 0;
9998
9999 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
10000 boost::scoped_ptr<Formatter> f(Formatter::create(format));
10001
10002 string prefix;
10003 cmd_getval(cmdmap, "prefix", prefix);
10004
10005 int64_t osdid;
10006 string osd_name;
10007 bool osdid_present = false;
10008 if (prefix != "osd pg-temp" &&
10009 prefix != "osd pg-upmap" &&
10010 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
10011 osdid_present = cmd_getval(cmdmap, "id", osdid);
10012 }
10013 if (osdid_present) {
10014 ostringstream oss;
10015 oss << "osd." << osdid;
10016 osd_name = oss.str();
10017 }
10018
10019 // Even if there's a pending state with changes that could affect
10020 // a command, considering that said state isn't yet committed, we
10021 // just don't care about those changes if the command currently being
10022 // handled acts as a no-op against the current committed state.
10023 // In a nutshell, we assume this command happens *before*.
10024 //
10025 // Let me make this clearer:
10026 //
10027 // - If we have only one client, and that client issues some
10028 // operation that would conflict with this operation but is
10029 // still on the pending state, then we would be sure that said
10030 // operation wouldn't have returned yet, so the client wouldn't
10031 // issue this operation (unless the client didn't wait for the
10032 // operation to finish, and that would be the client's own fault).
10033 //
10034 // - If we have more than one client, each client will observe
10035 // whatever is the state at the moment of the commit. So, if we
10036 // have two clients, one issuing an unlink and another issuing a
10037 // link, and if the link happens while the unlink is still on the
10038 // pending state, from the link's point-of-view this is a no-op.
10039 // If different clients are issuing conflicting operations and
10040 // they care about that, then the clients should make sure they
10041 // enforce some kind of concurrency mechanism -- from our
10042 // perspective that's what Douglas Adams would call an SEP.
10043 //
10044 // This should be used as a general guideline for most commands handled
10045 // in this function. Adapt as you see fit, but please bear in mind that
10046 // this is the expected behavior.
10047
10048
10049 if (prefix == "osd setcrushmap" ||
10050 (prefix == "osd crush set" && !osdid_present)) {
10051 if (pending_inc.crush.length()) {
10052 dout(10) << __func__ << " waiting for pending crush update " << dendl;
10053 goto wait;
10054 }
10055 dout(10) << "prepare_command setting new crush map" << dendl;
10056 bufferlist data(m->get_data());
10057 CrushWrapper crush;
10058 try {
10059 auto bl = data.cbegin();
10060 crush.decode(bl);
10061 }
10062 catch (const std::exception &e) {
10063 err = -EINVAL;
10064 ss << "Failed to parse crushmap: " << e.what();
10065 goto reply_no_propose;
10066 }
10067
10068 int64_t prior_version = 0;
10069 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
10070 if (prior_version == osdmap.get_crush_version() - 1) {
10071 // see if we are a resend of the last update. this is imperfect
10072 // (multiple racing updaters may not both get reliable success)
10073 // but we expect crush updaters (via this interface) to be rare-ish.
10074 bufferlist current, proposed;
10075 osdmap.crush->encode(current, mon.get_quorum_con_features());
10076 crush.encode(proposed, mon.get_quorum_con_features());
10077 if (current.contents_equal(proposed)) {
10078 dout(10) << __func__
10079 << " proposed matches current and version equals previous"
10080 << dendl;
10081 err = 0;
10082 ss << osdmap.get_crush_version();
10083 goto reply_no_propose;
10084 }
10085 }
10086 if (prior_version != osdmap.get_crush_version()) {
10087 err = -EPERM;
10088 ss << "prior_version " << prior_version << " != crush version "
10089 << osdmap.get_crush_version();
10090 goto reply_no_propose;
10091 }
10092 }
10093
10094 if (!validate_crush_against_features(&crush, ss)) {
10095 err = -EINVAL;
10096 goto reply_no_propose;
10097 }
10098
10099 err = osdmap.validate_crush_rules(&crush, &ss);
10100 if (err < 0) {
10101 goto reply_no_propose;
10102 }
10103
10104 if (g_conf()->mon_osd_crush_smoke_test) {
10105 // sanity check: test some inputs to make sure this map isn't
10106 // totally broken
10107 dout(10) << " testing map" << dendl;
10108 stringstream ess;
10109 CrushTester tester(crush, ess);
10110 tester.set_min_x(0);
10111 tester.set_max_x(50);
10112 tester.set_num_rep(3); // arbitrary
10113 auto start = ceph::coarse_mono_clock::now();
10114 int r = tester.test_with_fork(cct, g_conf()->mon_lease);
10115 auto duration = ceph::coarse_mono_clock::now() - start;
10116 if (r < 0) {
10117 dout(10) << " tester.test_with_fork returns " << r
10118 << ": " << ess.str() << dendl;
10119 ss << "crush smoke test failed with " << r << ": " << ess.str();
10120 err = r;
10121 goto reply_no_propose;
10122 }
10123 dout(10) << __func__ << " crush somke test duration: "
10124 << duration << ", result: " << ess.str() << dendl;
10125 }
10126
10127 pending_inc.crush = data;
10128 ss << osdmap.get_crush_version() + 1;
10129 goto update;
10130
10131 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10132 CrushWrapper newcrush = _get_pending_crush();
10133 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10134 int bid = -1 - b;
10135 if (newcrush.bucket_exists(bid) &&
10136 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10137 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10138 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10139 }
10140 }
10141 if (!validate_crush_against_features(&newcrush, ss)) {
10142 err = -EINVAL;
10143 goto reply_no_propose;
10144 }
10145 pending_inc.crush.clear();
10146 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10147 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10148 get_last_committed() + 1));
10149 return true;
10150 } else if (prefix == "osd crush set-device-class") {
10151 string device_class;
10152 if (!cmd_getval(cmdmap, "class", device_class)) {
10153 err = -EINVAL; // no value!
10154 goto reply_no_propose;
10155 }
10156
10157 bool stop = false;
10158 vector<string> idvec;
10159 cmd_getval(cmdmap, "ids", idvec);
10160 CrushWrapper newcrush = _get_pending_crush();
10161 set<int> updated;
10162 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10163 set<int> osds;
10164 // wildcard?
10165 if (j == 0 &&
10166 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10167 osdmap.get_all_osds(osds);
10168 stop = true;
10169 } else {
10170 // try traditional single osd way
10171 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10172 if (osd < 0) {
10173 // ss has reason for failure
10174 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10175 err = -EINVAL;
10176 continue;
10177 }
10178 osds.insert(osd);
10179 }
10180
10181 for (auto &osd : osds) {
10182 if (!osdmap.exists(osd)) {
10183 ss << "osd." << osd << " does not exist. ";
10184 continue;
10185 }
10186
10187 ostringstream oss;
10188 oss << "osd." << osd;
10189 string name = oss.str();
10190
10191 if (newcrush.get_max_devices() < osd + 1) {
10192 newcrush.set_max_devices(osd + 1);
10193 }
10194 string action;
10195 if (newcrush.item_exists(osd)) {
10196 action = "updating";
10197 } else {
10198 action = "creating";
10199 newcrush.set_item_name(osd, name);
10200 }
10201
10202 dout(5) << action << " crush item id " << osd << " name '" << name
10203 << "' device_class '" << device_class << "'"
10204 << dendl;
10205 err = newcrush.update_device_class(osd, device_class, name, &ss);
10206 if (err < 0) {
10207 goto reply_no_propose;
10208 }
10209 if (err == 0 && !_have_pending_crush()) {
10210 if (!stop) {
10211 // for single osd only, wildcard makes too much noise
10212 ss << "set-device-class item id " << osd << " name '" << name
10213 << "' device_class '" << device_class << "': no change. ";
10214 }
10215 } else {
10216 updated.insert(osd);
10217 }
10218 }
10219 }
10220
10221 pending_inc.crush.clear();
10222 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10223 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10224 getline(ss, rs);
10225 wait_for_finished_proposal(
10226 op,
10227 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10228 return true;
10229 } else if (prefix == "osd crush rm-device-class") {
10230 bool stop = false;
10231 vector<string> idvec;
10232 cmd_getval(cmdmap, "ids", idvec);
10233 CrushWrapper newcrush = _get_pending_crush();
10234 set<int> updated;
10235
10236 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10237 set<int> osds;
10238
10239 // wildcard?
10240 if (j == 0 &&
10241 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10242 osdmap.get_all_osds(osds);
10243 stop = true;
10244 } else {
10245 // try traditional single osd way
10246 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10247 if (osd < 0) {
10248 // ss has reason for failure
10249 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10250 err = -EINVAL;
10251 goto reply_no_propose;
10252 }
10253 osds.insert(osd);
10254 }
10255
10256 for (auto &osd : osds) {
10257 if (!osdmap.exists(osd)) {
10258 ss << "osd." << osd << " does not exist. ";
10259 continue;
10260 }
10261
10262 auto class_name = newcrush.get_item_class(osd);
10263 if (!class_name) {
10264 ss << "osd." << osd << " belongs to no class, ";
10265 continue;
10266 }
10267 // note that we do not verify if class_is_in_use here
10268 // in case the device is misclassified and user wants
10269 // to overridely reset...
10270
10271 err = newcrush.remove_device_class(cct, osd, &ss);
10272 if (err < 0) {
10273 // ss has reason for failure
10274 goto reply_no_propose;
10275 }
10276 updated.insert(osd);
10277 }
10278 }
10279
10280 pending_inc.crush.clear();
10281 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10282 ss << "done removing class of osd(s): " << updated;
10283 getline(ss, rs);
10284 wait_for_finished_proposal(
10285 op,
10286 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10287 return true;
10288 } else if (prefix == "osd crush class create") {
10289 string device_class;
10290 if (!cmd_getval(cmdmap, "class", device_class)) {
10291 err = -EINVAL; // no value!
10292 goto reply_no_propose;
10293 }
10294 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10295 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10296 << "luminous' before using crush device classes";
10297 err = -EPERM;
10298 goto reply_no_propose;
10299 }
10300 if (!_have_pending_crush() &&
10301 _get_stable_crush().class_exists(device_class)) {
10302 ss << "class '" << device_class << "' already exists";
10303 goto reply_no_propose;
10304 }
10305 CrushWrapper newcrush = _get_pending_crush();
10306 if (newcrush.class_exists(device_class)) {
10307 ss << "class '" << device_class << "' already exists";
10308 goto update;
10309 }
10310 int class_id = newcrush.get_or_create_class_id(device_class);
10311 pending_inc.crush.clear();
10312 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10313 ss << "created class " << device_class << " with id " << class_id
10314 << " to crush map";
10315 goto update;
10316 } else if (prefix == "osd crush class rm") {
10317 string device_class;
10318 if (!cmd_getval(cmdmap, "class", device_class)) {
10319 err = -EINVAL; // no value!
10320 goto reply_no_propose;
10321 }
10322 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10323 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10324 << "luminous' before using crush device classes";
10325 err = -EPERM;
10326 goto reply_no_propose;
10327 }
10328
10329 if (!osdmap.crush->class_exists(device_class)) {
10330 err = 0;
10331 goto reply_no_propose;
10332 }
10333
10334 CrushWrapper newcrush = _get_pending_crush();
10335 if (!newcrush.class_exists(device_class)) {
10336 err = 0; // make command idempotent
10337 goto wait;
10338 }
10339 int class_id = newcrush.get_class_id(device_class);
10340 stringstream ts;
10341 if (newcrush.class_is_in_use(class_id, &ts)) {
10342 err = -EBUSY;
10343 ss << "class '" << device_class << "' " << ts.str();
10344 goto reply_no_propose;
10345 }
10346
10347 // check if class is used by any erasure-code-profiles
10348 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10349 osdmap.get_erasure_code_profiles();
10350 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10351 #ifdef HAVE_STDLIB_MAP_SPLICING
10352 ec_profiles.merge(old_ec_profiles);
10353 #else
10354 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10355 make_move_iterator(end(old_ec_profiles)));
10356 #endif
10357 list<string> referenced_by;
10358 for (auto &i: ec_profiles) {
10359 for (auto &j: i.second) {
10360 if ("crush-device-class" == j.first && device_class == j.second) {
10361 referenced_by.push_back(i.first);
10362 }
10363 }
10364 }
10365 if (!referenced_by.empty()) {
10366 err = -EBUSY;
10367 ss << "class '" << device_class
10368 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10369 goto reply_no_propose;
10370 }
10371
10372 set<int> osds;
10373 newcrush.get_devices_by_class(device_class, &osds);
10374 for (auto& p: osds) {
10375 err = newcrush.remove_device_class(cct, p, &ss);
10376 if (err < 0) {
10377 // ss has reason for failure
10378 goto reply_no_propose;
10379 }
10380 }
10381
10382 if (osds.empty()) {
10383 // empty class, remove directly
10384 err = newcrush.remove_class_name(device_class);
10385 if (err < 0) {
10386 ss << "class '" << device_class << "' cannot be removed '"
10387 << cpp_strerror(err) << "'";
10388 goto reply_no_propose;
10389 }
10390 }
10391
10392 pending_inc.crush.clear();
10393 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10394 ss << "removed class " << device_class << " with id " << class_id
10395 << " from crush map";
10396 goto update;
10397 } else if (prefix == "osd crush class rename") {
10398 string srcname, dstname;
10399 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10400 err = -EINVAL;
10401 goto reply_no_propose;
10402 }
10403 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10404 err = -EINVAL;
10405 goto reply_no_propose;
10406 }
10407
10408 CrushWrapper newcrush = _get_pending_crush();
10409 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10410 // suppose this is a replay and return success
10411 // so command is idempotent
10412 ss << "already renamed to '" << dstname << "'";
10413 err = 0;
10414 goto reply_no_propose;
10415 }
10416
10417 err = newcrush.rename_class(srcname, dstname);
10418 if (err < 0) {
10419 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10420 << cpp_strerror(err);
10421 goto reply_no_propose;
10422 }
10423
10424 pending_inc.crush.clear();
10425 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10426 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10427 goto update;
10428 } else if (prefix == "osd crush add-bucket") {
10429 // os crush add-bucket <name> <type>
10430 string name, typestr;
10431 vector<string> argvec;
10432 cmd_getval(cmdmap, "name", name);
10433 cmd_getval(cmdmap, "type", typestr);
10434 cmd_getval(cmdmap, "args", argvec);
10435 map<string,string> loc;
10436 if (!argvec.empty()) {
10437 CrushWrapper::parse_loc_map(argvec, &loc);
10438 dout(0) << "will create and move bucket '" << name
10439 << "' to location " << loc << dendl;
10440 }
10441
10442 if (!_have_pending_crush() &&
10443 _get_stable_crush().name_exists(name)) {
10444 ss << "bucket '" << name << "' already exists";
10445 goto reply_no_propose;
10446 }
10447
10448 CrushWrapper newcrush = _get_pending_crush();
10449
10450 if (newcrush.name_exists(name)) {
10451 ss << "bucket '" << name << "' already exists";
10452 goto update;
10453 }
10454 int type = newcrush.get_type_id(typestr);
10455 if (type < 0) {
10456 ss << "type '" << typestr << "' does not exist";
10457 err = -EINVAL;
10458 goto reply_no_propose;
10459 }
10460 if (type == 0) {
10461 ss << "type '" << typestr << "' is for devices, not buckets";
10462 err = -EINVAL;
10463 goto reply_no_propose;
10464 }
10465 int bucketno;
10466 err = newcrush.add_bucket(0, 0,
10467 CRUSH_HASH_DEFAULT, type, 0, NULL,
10468 NULL, &bucketno);
10469 if (err < 0) {
10470 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10471 goto reply_no_propose;
10472 }
10473 err = newcrush.set_item_name(bucketno, name);
10474 if (err < 0) {
10475 ss << "error setting bucket name to '" << name << "'";
10476 goto reply_no_propose;
10477 }
10478
10479 if (!loc.empty()) {
10480 if (!newcrush.check_item_loc(cct, bucketno, loc,
10481 (int *)NULL)) {
10482 err = newcrush.move_bucket(cct, bucketno, loc);
10483 if (err < 0) {
10484 ss << "error moving bucket '" << name << "' to location " << loc;
10485 goto reply_no_propose;
10486 }
10487 } else {
10488 ss << "no need to move item id " << bucketno << " name '" << name
10489 << "' to location " << loc << " in crush map";
10490 }
10491 }
10492
10493 pending_inc.crush.clear();
10494 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10495 if (loc.empty()) {
10496 ss << "added bucket " << name << " type " << typestr
10497 << " to crush map";
10498 } else {
10499 ss << "added bucket " << name << " type " << typestr
10500 << " to location " << loc;
10501 }
10502 goto update;
10503 } else if (prefix == "osd crush rename-bucket") {
10504 string srcname, dstname;
10505 cmd_getval(cmdmap, "srcname", srcname);
10506 cmd_getval(cmdmap, "dstname", dstname);
10507
10508 err = crush_rename_bucket(srcname, dstname, &ss);
10509 if (err) {
10510 // equivalent to success for idempotency
10511 if (err == -EALREADY) {
10512 err = 0;
10513 }
10514 goto reply_no_propose;
10515 } else {
10516 goto update;
10517 }
10518 } else if (prefix == "osd crush weight-set create" ||
10519 prefix == "osd crush weight-set create-compat") {
10520 if (_have_pending_crush()) {
10521 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10522 goto wait;
10523 }
10524 CrushWrapper newcrush = _get_pending_crush();
10525 int64_t pool;
10526 int positions;
10527 if (newcrush.has_non_straw2_buckets()) {
10528 ss << "crush map contains one or more bucket(s) that are not straw2";
10529 err = -EPERM;
10530 goto reply_no_propose;
10531 }
10532 if (prefix == "osd crush weight-set create") {
10533 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10534 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10535 ss << "require_min_compat_client "
10536 << osdmap.require_min_compat_client
10537 << " < luminous, which is required for per-pool weight-sets. "
10538 << "Try 'ceph osd set-require-min-compat-client luminous' "
10539 << "before using the new interface";
10540 err = -EPERM;
10541 goto reply_no_propose;
10542 }
10543 string poolname, mode;
10544 cmd_getval(cmdmap, "pool", poolname);
10545 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10546 if (pool < 0) {
10547 ss << "pool '" << poolname << "' not found";
10548 err = -ENOENT;
10549 goto reply_no_propose;
10550 }
10551 cmd_getval(cmdmap, "mode", mode);
10552 if (mode != "flat" && mode != "positional") {
10553 ss << "unrecognized weight-set mode '" << mode << "'";
10554 err = -EINVAL;
10555 goto reply_no_propose;
10556 }
10557 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10558 } else {
10559 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10560 positions = 1;
10561 }
10562 if (!newcrush.create_choose_args(pool, positions)) {
10563 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10564 ss << "compat weight-set already created";
10565 } else {
10566 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10567 << "' already created";
10568 }
10569 goto reply_no_propose;
10570 }
10571 pending_inc.crush.clear();
10572 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10573 goto update;
10574
10575 } else if (prefix == "osd crush weight-set rm" ||
10576 prefix == "osd crush weight-set rm-compat") {
10577 CrushWrapper newcrush = _get_pending_crush();
10578 int64_t pool;
10579 if (prefix == "osd crush weight-set rm") {
10580 string poolname;
10581 cmd_getval(cmdmap, "pool", poolname);
10582 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10583 if (pool < 0) {
10584 ss << "pool '" << poolname << "' not found";
10585 err = -ENOENT;
10586 goto reply_no_propose;
10587 }
10588 } else {
10589 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10590 }
10591 newcrush.rm_choose_args(pool);
10592 pending_inc.crush.clear();
10593 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10594 goto update;
10595
10596 } else if (prefix == "osd crush weight-set reweight" ||
10597 prefix == "osd crush weight-set reweight-compat") {
10598 string poolname, item;
10599 vector<double> weight;
10600 cmd_getval(cmdmap, "pool", poolname);
10601 cmd_getval(cmdmap, "item", item);
10602 cmd_getval(cmdmap, "weight", weight);
10603 CrushWrapper newcrush = _get_pending_crush();
10604 int64_t pool;
10605 if (prefix == "osd crush weight-set reweight") {
10606 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10607 if (pool < 0) {
10608 ss << "pool '" << poolname << "' not found";
10609 err = -ENOENT;
10610 goto reply_no_propose;
10611 }
10612 if (!newcrush.have_choose_args(pool)) {
10613 ss << "no weight-set for pool '" << poolname << "'";
10614 err = -ENOENT;
10615 goto reply_no_propose;
10616 }
10617 auto arg_map = newcrush.choose_args_get(pool);
10618 int positions = newcrush.get_choose_args_positions(arg_map);
10619 if (weight.size() != (size_t)positions) {
10620 ss << "must specify exact " << positions << " weight values";
10621 err = -EINVAL;
10622 goto reply_no_propose;
10623 }
10624 } else {
10625 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10626 if (!newcrush.have_choose_args(pool)) {
10627 ss << "no backward-compatible weight-set";
10628 err = -ENOENT;
10629 goto reply_no_propose;
10630 }
10631 }
10632 if (!newcrush.name_exists(item)) {
10633 ss << "item '" << item << "' does not exist";
10634 err = -ENOENT;
10635 goto reply_no_propose;
10636 }
10637 err = newcrush.choose_args_adjust_item_weightf(
10638 cct,
10639 newcrush.choose_args_get(pool),
10640 newcrush.get_item_id(item),
10641 weight,
10642 &ss);
10643 if (err < 0) {
10644 goto reply_no_propose;
10645 }
10646 err = 0;
10647 pending_inc.crush.clear();
10648 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10649 goto update;
10650 } else if (osdid_present &&
10651 (prefix == "osd crush set" || prefix == "osd crush add")) {
10652 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10653 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10654 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10655
10656 if (!osdmap.exists(osdid)) {
10657 err = -ENOENT;
10658 ss << osd_name
10659 << " does not exist. Create it before updating the crush map";
10660 goto reply_no_propose;
10661 }
10662
10663 double weight;
10664 if (!cmd_getval(cmdmap, "weight", weight)) {
10665 ss << "unable to parse weight value '"
10666 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10667 err = -EINVAL;
10668 goto reply_no_propose;
10669 }
10670
10671 string args;
10672 vector<string> argvec;
10673 cmd_getval(cmdmap, "args", argvec);
10674 map<string,string> loc;
10675 CrushWrapper::parse_loc_map(argvec, &loc);
10676
10677 if (prefix == "osd crush set"
10678 && !_get_stable_crush().item_exists(osdid)) {
10679 err = -ENOENT;
10680 ss << "unable to set item id " << osdid << " name '" << osd_name
10681 << "' weight " << weight << " at location " << loc
10682 << ": does not exist";
10683 goto reply_no_propose;
10684 }
10685
10686 dout(5) << "adding/updating crush item id " << osdid << " name '"
10687 << osd_name << "' weight " << weight << " at location "
10688 << loc << dendl;
10689 CrushWrapper newcrush = _get_pending_crush();
10690
10691 string action;
10692 if (prefix == "osd crush set" ||
10693 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10694 action = "set";
10695 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10696 } else {
10697 action = "add";
10698 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10699 if (err == 0)
10700 err = 1;
10701 }
10702
10703 if (err < 0)
10704 goto reply_no_propose;
10705
10706 if (err == 0 && !_have_pending_crush()) {
10707 ss << action << " item id " << osdid << " name '" << osd_name
10708 << "' weight " << weight << " at location " << loc << ": no change";
10709 goto reply_no_propose;
10710 }
10711
10712 pending_inc.crush.clear();
10713 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10714 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10715 << weight << " at location " << loc << " to crush map";
10716 getline(ss, rs);
10717 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10718 get_last_committed() + 1));
10719 return true;
10720
10721 } else if (prefix == "osd crush create-or-move") {
10722 do {
10723 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10724 if (!osdmap.exists(osdid)) {
10725 err = -ENOENT;
10726 ss << osd_name
10727 << " does not exist. create it before updating the crush map";
10728 goto reply_no_propose;
10729 }
10730
10731 double weight;
10732 if (!cmd_getval(cmdmap, "weight", weight)) {
10733 ss << "unable to parse weight value '"
10734 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10735 err = -EINVAL;
10736 goto reply_no_propose;
10737 }
10738
10739 string args;
10740 vector<string> argvec;
10741 cmd_getval(cmdmap, "args", argvec);
10742 map<string,string> loc;
10743 CrushWrapper::parse_loc_map(argvec, &loc);
10744
10745 dout(0) << "create-or-move crush item name '" << osd_name
10746 << "' initial_weight " << weight << " at location " << loc
10747 << dendl;
10748
10749 CrushWrapper newcrush = _get_pending_crush();
10750
10751 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10752 g_conf()->osd_crush_update_weight_set);
10753 if (err == 0) {
10754 ss << "create-or-move updated item name '" << osd_name
10755 << "' weight " << weight
10756 << " at location " << loc << " to crush map";
10757 break;
10758 }
10759 if (err > 0) {
10760 pending_inc.crush.clear();
10761 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10762 ss << "create-or-move updating item name '" << osd_name
10763 << "' weight " << weight
10764 << " at location " << loc << " to crush map";
10765 getline(ss, rs);
10766 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10767 get_last_committed() + 1));
10768 return true;
10769 }
10770 } while (false);
10771
10772 } else if (prefix == "osd crush move") {
10773 do {
10774 // osd crush move <name> <loc1> [<loc2> ...]
10775 string name;
10776 vector<string> argvec;
10777 cmd_getval(cmdmap, "name", name);
10778 cmd_getval(cmdmap, "args", argvec);
10779 map<string,string> loc;
10780 CrushWrapper::parse_loc_map(argvec, &loc);
10781
10782 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10783 CrushWrapper newcrush = _get_pending_crush();
10784
10785 if (!newcrush.name_exists(name)) {
10786 err = -ENOENT;
10787 ss << "item " << name << " does not exist";
10788 break;
10789 }
10790 int id = newcrush.get_item_id(name);
10791
10792 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10793 if (id >= 0) {
10794 err = newcrush.create_or_move_item(
10795 cct, id, 0, name, loc,
10796 g_conf()->osd_crush_update_weight_set);
10797 } else {
10798 err = newcrush.move_bucket(cct, id, loc);
10799 }
10800 if (err >= 0) {
10801 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10802 pending_inc.crush.clear();
10803 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10804 getline(ss, rs);
10805 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10806 get_last_committed() + 1));
10807 return true;
10808 }
10809 } else {
10810 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10811 err = 0;
10812 }
10813 } while (false);
10814 } else if (prefix == "osd crush swap-bucket") {
10815 string source, dest;
10816 cmd_getval(cmdmap, "source", source);
10817 cmd_getval(cmdmap, "dest", dest);
10818
10819 bool force = false;
10820 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10821
10822 CrushWrapper newcrush = _get_pending_crush();
10823 if (!newcrush.name_exists(source)) {
10824 ss << "source item " << source << " does not exist";
10825 err = -ENOENT;
10826 goto reply_no_propose;
10827 }
10828 if (!newcrush.name_exists(dest)) {
10829 ss << "dest item " << dest << " does not exist";
10830 err = -ENOENT;
10831 goto reply_no_propose;
10832 }
10833 int sid = newcrush.get_item_id(source);
10834 int did = newcrush.get_item_id(dest);
10835 int sparent;
10836 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10837 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10838 err = -EPERM;
10839 goto reply_no_propose;
10840 }
10841 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10842 !force) {
10843 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10844 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10845 << "; pass --yes-i-really-mean-it to proceed anyway";
10846 err = -EPERM;
10847 goto reply_no_propose;
10848 }
10849 int r = newcrush.swap_bucket(cct, sid, did);
10850 if (r < 0) {
10851 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10852 err = r;
10853 goto reply_no_propose;
10854 }
10855 ss << "swapped bucket of " << source << " to " << dest;
10856 pending_inc.crush.clear();
10857 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10858 wait_for_finished_proposal(op,
10859 new Monitor::C_Command(mon, op, err, ss.str(),
10860 get_last_committed() + 1));
10861 return true;
10862 } else if (prefix == "osd crush link") {
10863 // osd crush link <name> <loc1> [<loc2> ...]
10864 string name;
10865 cmd_getval(cmdmap, "name", name);
10866 vector<string> argvec;
10867 cmd_getval(cmdmap, "args", argvec);
10868 map<string,string> loc;
10869 CrushWrapper::parse_loc_map(argvec, &loc);
10870
10871 // Need an explicit check for name_exists because get_item_id returns
10872 // 0 when the name is not found.
10873 int id = osdmap.crush->get_item_id(name);
10874 if (!osdmap.crush->name_exists(name)) {
10875 err = -ENOENT;
10876 ss << "item " << name << " does not exist";
10877 goto reply_no_propose;
10878 } else {
10879 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10880 }
10881 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10882 ss << "no need to move item id " << id << " name '" << name
10883 << "' to location " << loc << " in crush map";
10884 err = 0;
10885 goto reply_no_propose;
10886 }
10887
10888 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10889 CrushWrapper newcrush = _get_pending_crush();
10890
10891 if (!newcrush.name_exists(name)) {
10892 err = -ENOENT;
10893 ss << "item " << name << " does not exist";
10894 goto reply_no_propose;
10895 } else {
10896 int id = newcrush.get_item_id(name);
10897 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10898 err = newcrush.link_bucket(cct, id, loc);
10899 if (err >= 0) {
10900 ss << "linked item id " << id << " name '" << name
10901 << "' to location " << loc << " in crush map";
10902 pending_inc.crush.clear();
10903 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10904 } else {
10905 ss << "cannot link item id " << id << " name '" << name
10906 << "' to location " << loc;
10907 goto reply_no_propose;
10908 }
10909 } else {
10910 ss << "no need to move item id " << id << " name '" << name
10911 << "' to location " << loc << " in crush map";
10912 err = 0;
10913 }
10914 }
10915 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10916 get_last_committed() + 1));
10917 return true;
10918 } else if (prefix == "osd crush rm" ||
10919 prefix == "osd crush remove" ||
10920 prefix == "osd crush unlink") {
10921 do {
10922 // osd crush rm <name> [ancestor]
10923 CrushWrapper newcrush = _get_pending_crush();
10924
10925 string name;
10926 cmd_getval(cmdmap, "name", name);
10927
10928 if (!osdmap.crush->name_exists(name)) {
10929 err = 0;
10930 ss << "device '" << name << "' does not appear in the crush map";
10931 break;
10932 }
10933 if (!newcrush.name_exists(name)) {
10934 err = 0;
10935 ss << "device '" << name << "' does not appear in the crush map";
10936 getline(ss, rs);
10937 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10938 get_last_committed() + 1));
10939 return true;
10940 }
10941 int id = newcrush.get_item_id(name);
10942 int ancestor = 0;
10943
10944 bool unlink_only = prefix == "osd crush unlink";
10945 string ancestor_str;
10946 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10947 if (!newcrush.name_exists(ancestor_str)) {
10948 err = -ENOENT;
10949 ss << "ancestor item '" << ancestor_str
10950 << "' does not appear in the crush map";
10951 break;
10952 }
10953 ancestor = newcrush.get_item_id(ancestor_str);
10954 }
10955
10956 err = prepare_command_osd_crush_remove(
10957 newcrush,
10958 id, ancestor,
10959 (ancestor < 0), unlink_only);
10960
10961 if (err == -ENOENT) {
10962 ss << "item " << id << " does not appear in that position";
10963 err = 0;
10964 break;
10965 }
10966 if (err == 0) {
10967 if (!unlink_only)
10968 pending_inc.new_crush_node_flags[id] = 0;
10969 ss << "removed item id " << id << " name '" << name << "' from crush map";
10970 getline(ss, rs);
10971 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10972 get_last_committed() + 1));
10973 return true;
10974 }
10975 } while (false);
10976
10977 } else if (prefix == "osd crush reweight-all") {
10978 CrushWrapper newcrush = _get_pending_crush();
10979
10980 newcrush.reweight(cct);
10981 pending_inc.crush.clear();
10982 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10983 ss << "reweighted crush hierarchy";
10984 getline(ss, rs);
10985 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10986 get_last_committed() + 1));
10987 return true;
10988 } else if (prefix == "osd crush reweight") {
10989 // osd crush reweight <name> <weight>
10990 CrushWrapper newcrush = _get_pending_crush();
10991
10992 string name;
10993 cmd_getval(cmdmap, "name", name);
10994 if (!newcrush.name_exists(name)) {
10995 err = -ENOENT;
10996 ss << "device '" << name << "' does not appear in the crush map";
10997 goto reply_no_propose;
10998 }
10999
11000 int id = newcrush.get_item_id(name);
11001 if (id < 0) {
11002 ss << "device '" << name << "' is not a leaf in the crush map";
11003 err = -EINVAL;
11004 goto reply_no_propose;
11005 }
11006 double w;
11007 if (!cmd_getval(cmdmap, "weight", w)) {
11008 ss << "unable to parse weight value '"
11009 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11010 err = -EINVAL;
11011 goto reply_no_propose;
11012 }
11013
11014 err = newcrush.adjust_item_weightf(cct, id, w,
11015 g_conf()->osd_crush_update_weight_set);
11016 if (err < 0)
11017 goto reply_no_propose;
11018 pending_inc.crush.clear();
11019 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11020 ss << "reweighted item id " << id << " name '" << name << "' to " << w
11021 << " in crush map";
11022 getline(ss, rs);
11023 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11024 get_last_committed() + 1));
11025 return true;
11026 } else if (prefix == "osd crush reweight-subtree") {
11027 // osd crush reweight-subtree <name> <weight>
11028 CrushWrapper newcrush = _get_pending_crush();
11029
11030 string name;
11031 cmd_getval(cmdmap, "name", name);
11032 if (!newcrush.name_exists(name)) {
11033 err = -ENOENT;
11034 ss << "device '" << name << "' does not appear in the crush map";
11035 goto reply_no_propose;
11036 }
11037
11038 int id = newcrush.get_item_id(name);
11039 if (id >= 0) {
11040 ss << "device '" << name << "' is not a subtree in the crush map";
11041 err = -EINVAL;
11042 goto reply_no_propose;
11043 }
11044 double w;
11045 if (!cmd_getval(cmdmap, "weight", w)) {
11046 ss << "unable to parse weight value '"
11047 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
11048 err = -EINVAL;
11049 goto reply_no_propose;
11050 }
11051
11052 err = newcrush.adjust_subtree_weightf(cct, id, w,
11053 g_conf()->osd_crush_update_weight_set);
11054 if (err < 0)
11055 goto reply_no_propose;
11056 pending_inc.crush.clear();
11057 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11058 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
11059 << " in crush map";
11060 getline(ss, rs);
11061 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11062 get_last_committed() + 1));
11063 return true;
11064 } else if (prefix == "osd crush tunables") {
11065 CrushWrapper newcrush = _get_pending_crush();
11066
11067 err = 0;
11068 string profile;
11069 cmd_getval(cmdmap, "profile", profile);
11070 if (profile == "legacy" || profile == "argonaut") {
11071 newcrush.set_tunables_legacy();
11072 } else if (profile == "bobtail") {
11073 newcrush.set_tunables_bobtail();
11074 } else if (profile == "firefly") {
11075 newcrush.set_tunables_firefly();
11076 } else if (profile == "hammer") {
11077 newcrush.set_tunables_hammer();
11078 } else if (profile == "jewel") {
11079 newcrush.set_tunables_jewel();
11080 } else if (profile == "optimal") {
11081 newcrush.set_tunables_optimal();
11082 } else if (profile == "default") {
11083 newcrush.set_tunables_default();
11084 } else {
11085 ss << "unrecognized profile '" << profile << "'";
11086 err = -EINVAL;
11087 goto reply_no_propose;
11088 }
11089
11090 if (!validate_crush_against_features(&newcrush, ss)) {
11091 err = -EINVAL;
11092 goto reply_no_propose;
11093 }
11094
11095 pending_inc.crush.clear();
11096 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11097 ss << "adjusted tunables profile to " << profile;
11098 getline(ss, rs);
11099 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11100 get_last_committed() + 1));
11101 return true;
11102 } else if (prefix == "osd crush set-tunable") {
11103 CrushWrapper newcrush = _get_pending_crush();
11104
11105 err = 0;
11106 string tunable;
11107 cmd_getval(cmdmap, "tunable", tunable);
11108
11109 int64_t value = -1;
11110 if (!cmd_getval(cmdmap, "value", value)) {
11111 err = -EINVAL;
11112 ss << "failed to parse integer value "
11113 << cmd_vartype_stringify(cmdmap.at("value"));
11114 goto reply_no_propose;
11115 }
11116
11117 if (tunable == "straw_calc_version") {
11118 if (value != 0 && value != 1) {
11119 ss << "value must be 0 or 1; got " << value;
11120 err = -EINVAL;
11121 goto reply_no_propose;
11122 }
11123 newcrush.set_straw_calc_version(value);
11124 } else {
11125 ss << "unrecognized tunable '" << tunable << "'";
11126 err = -EINVAL;
11127 goto reply_no_propose;
11128 }
11129
11130 if (!validate_crush_against_features(&newcrush, ss)) {
11131 err = -EINVAL;
11132 goto reply_no_propose;
11133 }
11134
11135 pending_inc.crush.clear();
11136 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11137 ss << "adjusted tunable " << tunable << " to " << value;
11138 getline(ss, rs);
11139 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11140 get_last_committed() + 1));
11141 return true;
11142
11143 } else if (prefix == "osd crush rule create-simple") {
11144 string name, root, type, mode;
11145 cmd_getval(cmdmap, "name", name);
11146 cmd_getval(cmdmap, "root", root);
11147 cmd_getval(cmdmap, "type", type);
11148 cmd_getval(cmdmap, "mode", mode);
11149 if (mode == "")
11150 mode = "firstn";
11151
11152 if (osdmap.crush->rule_exists(name)) {
11153 // The name is uniquely associated with a ruleid and the rule it contains.
11154 // From the user's point of view, the rule is more meaningful.
11155 ss << "rule " << name << " already exists";
11156 err = 0;
11157 goto reply_no_propose;
11158 }
11159
11160 CrushWrapper newcrush = _get_pending_crush();
11161
11162 if (newcrush.rule_exists(name)) {
11163 // The name is uniquely associated with a ruleid and the rule it contains.
11164 // From the user's point of view, the rule is more meaningful.
11165 ss << "rule " << name << " already exists";
11166 err = 0;
11167 } else {
11168 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11169 pg_pool_t::TYPE_REPLICATED, &ss);
11170 if (ruleno < 0) {
11171 err = ruleno;
11172 goto reply_no_propose;
11173 }
11174
11175 pending_inc.crush.clear();
11176 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11177 }
11178 getline(ss, rs);
11179 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11180 get_last_committed() + 1));
11181 return true;
11182
11183 } else if (prefix == "osd crush rule create-replicated") {
11184 string name, root, type, device_class;
11185 cmd_getval(cmdmap, "name", name);
11186 cmd_getval(cmdmap, "root", root);
11187 cmd_getval(cmdmap, "type", type);
11188 cmd_getval(cmdmap, "class", device_class);
11189
11190 if (osdmap.crush->rule_exists(name)) {
11191 // The name is uniquely associated with a ruleid and the rule it contains.
11192 // From the user's point of view, the rule is more meaningful.
11193 ss << "rule " << name << " already exists";
11194 err = 0;
11195 goto reply_no_propose;
11196 }
11197
11198 CrushWrapper newcrush = _get_pending_crush();
11199
11200 if (newcrush.rule_exists(name)) {
11201 // The name is uniquely associated with a ruleid and the rule it contains.
11202 // From the user's point of view, the rule is more meaningful.
11203 ss << "rule " << name << " already exists";
11204 err = 0;
11205 } else {
11206 int ruleno = newcrush.add_simple_rule(
11207 name, root, type, device_class,
11208 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11209 if (ruleno < 0) {
11210 err = ruleno;
11211 goto reply_no_propose;
11212 }
11213
11214 pending_inc.crush.clear();
11215 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11216 }
11217 getline(ss, rs);
11218 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11219 get_last_committed() + 1));
11220 return true;
11221
11222 } else if (prefix == "osd erasure-code-profile rm") {
11223 string name;
11224 cmd_getval(cmdmap, "name", name);
11225
11226 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11227 goto wait;
11228
11229 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11230 err = -EBUSY;
11231 goto reply_no_propose;
11232 }
11233
11234 if (osdmap.has_erasure_code_profile(name) ||
11235 pending_inc.new_erasure_code_profiles.count(name)) {
11236 if (osdmap.has_erasure_code_profile(name)) {
11237 pending_inc.old_erasure_code_profiles.push_back(name);
11238 } else {
11239 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11240 pending_inc.new_erasure_code_profiles.erase(name);
11241 }
11242
11243 getline(ss, rs);
11244 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11245 get_last_committed() + 1));
11246 return true;
11247 } else {
11248 ss << "erasure-code-profile " << name << " does not exist";
11249 err = 0;
11250 goto reply_no_propose;
11251 }
11252
11253 } else if (prefix == "osd erasure-code-profile set") {
11254 string name;
11255 cmd_getval(cmdmap, "name", name);
11256 vector<string> profile;
11257 cmd_getval(cmdmap, "profile", profile);
11258
11259 bool force = false;
11260 cmd_getval(cmdmap, "force", force);
11261
11262 map<string,string> profile_map;
11263 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11264 if (err)
11265 goto reply_no_propose;
11266 if (auto found = profile_map.find("crush-failure-domain");
11267 found != profile_map.end()) {
11268 const auto& failure_domain = found->second;
11269 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11270 if (failure_domain_type < 0) {
11271 ss << "erasure-code-profile " << profile_map
11272 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11273 err = -EINVAL;
11274 goto reply_no_propose;
11275 }
11276 }
11277
11278 if (profile_map.find("plugin") == profile_map.end()) {
11279 ss << "erasure-code-profile " << profile_map
11280 << " must contain a plugin entry" << std::endl;
11281 err = -EINVAL;
11282 goto reply_no_propose;
11283 }
11284 string plugin = profile_map["plugin"];
11285
11286 if (pending_inc.has_erasure_code_profile(name)) {
11287 dout(20) << "erasure code profile " << name << " try again" << dendl;
11288 goto wait;
11289 } else {
11290 err = normalize_profile(name, profile_map, force, &ss);
11291 if (err)
11292 goto reply_no_propose;
11293
11294 if (osdmap.has_erasure_code_profile(name)) {
11295 ErasureCodeProfile existing_profile_map =
11296 osdmap.get_erasure_code_profile(name);
11297 err = normalize_profile(name, existing_profile_map, force, &ss);
11298 if (err)
11299 goto reply_no_propose;
11300
11301 if (existing_profile_map == profile_map) {
11302 err = 0;
11303 goto reply_no_propose;
11304 }
11305 if (!force) {
11306 err = -EPERM;
11307 ss << "will not override erasure code profile " << name
11308 << " because the existing profile "
11309 << existing_profile_map
11310 << " is different from the proposed profile "
11311 << profile_map;
11312 goto reply_no_propose;
11313 }
11314 }
11315
11316 dout(20) << "erasure code profile set " << name << "="
11317 << profile_map << dendl;
11318 pending_inc.set_erasure_code_profile(name, profile_map);
11319 }
11320
11321 getline(ss, rs);
11322 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11323 get_last_committed() + 1));
11324 return true;
11325
11326 } else if (prefix == "osd crush rule create-erasure") {
11327 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11328 if (err == -EAGAIN)
11329 goto wait;
11330 if (err)
11331 goto reply_no_propose;
11332 string name, poolstr;
11333 cmd_getval(cmdmap, "name", name);
11334 string profile;
11335 cmd_getval(cmdmap, "profile", profile);
11336 if (profile == "")
11337 profile = "default";
11338 if (profile == "default") {
11339 if (!osdmap.has_erasure_code_profile(profile)) {
11340 if (pending_inc.has_erasure_code_profile(profile)) {
11341 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11342 goto wait;
11343 }
11344
11345 map<string,string> profile_map;
11346 err = osdmap.get_erasure_code_profile_default(cct,
11347 profile_map,
11348 &ss);
11349 if (err)
11350 goto reply_no_propose;
11351 err = normalize_profile(name, profile_map, true, &ss);
11352 if (err)
11353 goto reply_no_propose;
11354 dout(20) << "erasure code profile set " << profile << "="
11355 << profile_map << dendl;
11356 pending_inc.set_erasure_code_profile(profile, profile_map);
11357 goto wait;
11358 }
11359 }
11360
11361 int rule;
11362 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11363 if (err < 0) {
11364 switch(err) {
11365 case -EEXIST: // return immediately
11366 ss << "rule " << name << " already exists";
11367 err = 0;
11368 goto reply_no_propose;
11369 case -EALREADY: // wait for pending to be proposed
11370 ss << "rule " << name << " already exists";
11371 err = 0;
11372 break;
11373 default: // non recoverable error
11374 goto reply_no_propose;
11375 }
11376 } else {
11377 ss << "created rule " << name << " at " << rule;
11378 }
11379
11380 getline(ss, rs);
11381 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11382 get_last_committed() + 1));
11383 return true;
11384
11385 } else if (prefix == "osd crush rule rm") {
11386 string name;
11387 cmd_getval(cmdmap, "name", name);
11388
11389 if (!osdmap.crush->rule_exists(name)) {
11390 ss << "rule " << name << " does not exist";
11391 err = 0;
11392 goto reply_no_propose;
11393 }
11394
11395 CrushWrapper newcrush = _get_pending_crush();
11396
11397 if (!newcrush.rule_exists(name)) {
11398 ss << "rule " << name << " does not exist";
11399 err = 0;
11400 } else {
11401 int ruleno = newcrush.get_rule_id(name);
11402 ceph_assert(ruleno >= 0);
11403
11404 // make sure it is not in use.
11405 // FIXME: this is ok in some situations, but let's not bother with that
11406 // complexity now.
11407 if (osdmap.crush_rule_in_use(ruleno)) {
11408 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11409 err = -EBUSY;
11410 goto reply_no_propose;
11411 }
11412
11413 err = newcrush.remove_rule(ruleno);
11414 if (err < 0) {
11415 goto reply_no_propose;
11416 }
11417
11418 pending_inc.crush.clear();
11419 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11420 }
11421 getline(ss, rs);
11422 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11423 get_last_committed() + 1));
11424 return true;
11425
11426 } else if (prefix == "osd crush rule rename") {
11427 string srcname;
11428 string dstname;
11429 cmd_getval(cmdmap, "srcname", srcname);
11430 cmd_getval(cmdmap, "dstname", dstname);
11431 if (srcname.empty() || dstname.empty()) {
11432 ss << "must specify both source rule name and destination rule name";
11433 err = -EINVAL;
11434 goto reply_no_propose;
11435 }
11436 if (srcname == dstname) {
11437 ss << "destination rule name is equal to source rule name";
11438 err = 0;
11439 goto reply_no_propose;
11440 }
11441
11442 CrushWrapper newcrush = _get_pending_crush();
11443 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11444 // srcname does not exist and dstname already exists
11445 // suppose this is a replay and return success
11446 // (so this command is idempotent)
11447 ss << "already renamed to '" << dstname << "'";
11448 err = 0;
11449 goto reply_no_propose;
11450 }
11451
11452 err = newcrush.rename_rule(srcname, dstname, &ss);
11453 if (err < 0) {
11454 // ss has reason for failure
11455 goto reply_no_propose;
11456 }
11457 pending_inc.crush.clear();
11458 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11459 getline(ss, rs);
11460 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11461 get_last_committed() + 1));
11462 return true;
11463
11464 } else if (prefix == "osd setmaxosd") {
11465 int64_t newmax;
11466 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11467 ss << "unable to parse 'newmax' value '"
11468 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11469 err = -EINVAL;
11470 goto reply_no_propose;
11471 }
11472
11473 if (newmax > g_conf()->mon_max_osd) {
11474 err = -ERANGE;
11475 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11476 << g_conf()->mon_max_osd << ")";
11477 goto reply_no_propose;
11478 }
11479
11480 // Don't allow shrinking OSD number as this will cause data loss
11481 // and may cause kernel crashes.
11482 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11483 if (newmax < osdmap.get_max_osd()) {
11484 // Check if the OSDs exist between current max and new value.
11485 // If there are any OSDs exist, then don't allow shrinking number
11486 // of OSDs.
11487 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11488 if (osdmap.exists(i)) {
11489 err = -EBUSY;
11490 ss << "cannot shrink max_osd to " << newmax
11491 << " because osd." << i << " (and possibly others) still in use";
11492 goto reply_no_propose;
11493 }
11494 }
11495 }
11496
11497 pending_inc.new_max_osd = newmax;
11498 ss << "set new max_osd = " << pending_inc.new_max_osd;
11499 getline(ss, rs);
11500 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11501 get_last_committed() + 1));
11502 return true;
11503
11504 } else if (prefix == "osd set-full-ratio" ||
11505 prefix == "osd set-backfillfull-ratio" ||
11506 prefix == "osd set-nearfull-ratio") {
11507 double n;
11508 if (!cmd_getval(cmdmap, "ratio", n)) {
11509 ss << "unable to parse 'ratio' value '"
11510 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11511 err = -EINVAL;
11512 goto reply_no_propose;
11513 }
11514 if (prefix == "osd set-full-ratio")
11515 pending_inc.new_full_ratio = n;
11516 else if (prefix == "osd set-backfillfull-ratio")
11517 pending_inc.new_backfillfull_ratio = n;
11518 else if (prefix == "osd set-nearfull-ratio")
11519 pending_inc.new_nearfull_ratio = n;
11520 ss << prefix << " " << n;
11521 getline(ss, rs);
11522 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11523 get_last_committed() + 1));
11524 return true;
11525 } else if (prefix == "osd set-require-min-compat-client") {
11526 string v;
11527 cmd_getval(cmdmap, "version", v);
11528 ceph_release_t vno = ceph_release_from_name(v);
11529 if (!vno) {
11530 ss << "version " << v << " is not recognized";
11531 err = -EINVAL;
11532 goto reply_no_propose;
11533 }
11534 OSDMap newmap;
11535 newmap.deepish_copy_from(osdmap);
11536 newmap.apply_incremental(pending_inc);
11537 newmap.require_min_compat_client = vno;
11538 auto mvno = newmap.get_min_compat_client();
11539 if (vno < mvno) {
11540 ss << "osdmap current utilizes features that require " << mvno
11541 << "; cannot set require_min_compat_client below that to " << vno;
11542 err = -EPERM;
11543 goto reply_no_propose;
11544 }
11545 bool sure = false;
11546 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11547 if (!sure) {
11548 FeatureMap m;
11549 mon.get_combined_feature_map(&m);
11550 uint64_t features = ceph_release_features(to_integer<int>(vno));
11551 bool first = true;
11552 bool ok = true;
11553 for (int type : {
11554 CEPH_ENTITY_TYPE_CLIENT,
11555 CEPH_ENTITY_TYPE_MDS,
11556 CEPH_ENTITY_TYPE_MGR }) {
11557 auto p = m.m.find(type);
11558 if (p == m.m.end()) {
11559 continue;
11560 }
11561 for (auto& q : p->second) {
11562 uint64_t missing = ~q.first & features;
11563 if (missing) {
11564 if (first) {
11565 ss << "cannot set require_min_compat_client to " << v << ": ";
11566 } else {
11567 ss << "; ";
11568 }
11569 first = false;
11570 ss << q.second << " connected " << ceph_entity_type_name(type)
11571 << "(s) look like " << ceph_release_name(
11572 ceph_release_from_features(q.first))
11573 << " (missing 0x" << std::hex << missing << std::dec << ")";
11574 ok = false;
11575 }
11576 }
11577 }
11578 if (!ok) {
11579 ss << "; add --yes-i-really-mean-it to do it anyway";
11580 err = -EPERM;
11581 goto reply_no_propose;
11582 }
11583 }
11584 ss << "set require_min_compat_client to " << vno;
11585 pending_inc.new_require_min_compat_client = vno;
11586 getline(ss, rs);
11587 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11588 get_last_committed() + 1));
11589 return true;
11590 } else if (prefix == "osd pause") {
11591 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11592
11593 } else if (prefix == "osd unpause") {
11594 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11595
11596 } else if (prefix == "osd set") {
11597 bool sure = false;
11598 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11599
11600 string key;
11601 cmd_getval(cmdmap, "key", key);
11602 if (key == "pause")
11603 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11604 else if (key == "noup")
11605 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11606 else if (key == "nodown")
11607 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11608 else if (key == "noout")
11609 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11610 else if (key == "noin")
11611 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11612 else if (key == "nobackfill")
11613 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11614 else if (key == "norebalance")
11615 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11616 else if (key == "norecover")
11617 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11618 else if (key == "noscrub")
11619 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11620 else if (key == "nodeep-scrub")
11621 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11622 else if (key == "notieragent")
11623 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11624 else if (key == "nosnaptrim")
11625 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11626 else if (key == "pglog_hardlimit") {
11627 if (!osdmap.get_num_up_osds() && !sure) {
11628 ss << "Not advisable to continue since no OSDs are up. Pass "
11629 << "--yes-i-really-mean-it if you really wish to continue.";
11630 err = -EPERM;
11631 goto reply_no_propose;
11632 }
11633 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11634 // we are reusing a jewel feature bit that was retired in luminous.
11635 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11636 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11637 || sure)) {
11638 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11639 } else {
11640 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11641 err = -EPERM;
11642 goto reply_no_propose;
11643 }
11644 } else if (key == "noautoscale") {
11645 return prepare_set_flag(op, CEPH_OSDMAP_NOAUTOSCALE);
11646 } else {
11647 ss << "unrecognized flag '" << key << "'";
11648 err = -EINVAL;
11649 }
11650
11651 } else if (prefix == "osd unset") {
11652 string key;
11653 cmd_getval(cmdmap, "key", key);
11654 if (key == "pause")
11655 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11656 else if (key == "noup")
11657 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11658 else if (key == "nodown")
11659 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11660 else if (key == "noout")
11661 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11662 else if (key == "noin")
11663 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11664 else if (key == "nobackfill")
11665 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11666 else if (key == "norebalance")
11667 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11668 else if (key == "norecover")
11669 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11670 else if (key == "noscrub")
11671 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11672 else if (key == "nodeep-scrub")
11673 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11674 else if (key == "notieragent")
11675 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11676 else if (key == "nosnaptrim")
11677 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11678 else if (key == "noautoscale")
11679 return prepare_unset_flag(op, CEPH_OSDMAP_NOAUTOSCALE);
11680 else {
11681 ss << "unrecognized flag '" << key << "'";
11682 err = -EINVAL;
11683 }
11684
  } else if (prefix == "osd require-osd-release") {
    // Raise the minimum OSD release the cluster requires.  Gated on: a
    // recognized release name, mons and up OSDs advertising the matching
    // feature bits, and (unless --yes-i-really-mean-it) at least one OSD
    // being up.  Lowering the value is never allowed (checked below).
    string release;
    cmd_getval(cmdmap, "release", release);
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    ceph_release_t rel = ceph_release_from_name(release.c_str());
    if (!rel) {
      ss << "unrecognized release " << release;
      err = -EINVAL;
      goto reply_no_propose;
    }
    if (rel == osdmap.require_osd_release) {
      // idempotent
      err = 0;
      goto reply_no_propose;
    }
    if (osdmap.require_osd_release < ceph_release_t::pacific && !sure) {
      ss << "Not advisable to continue since current 'require_osd_release' "
         << "refers to a very old Ceph release. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply_no_propose;
    }
    if (!osdmap.get_num_up_osds() && !sure) {
      // With no OSDs up we cannot verify OSD feature bits below.
      ss << "Not advisable to continue since no OSDs are up. Pass "
         << "--yes-i-really-mean-it if you really wish to continue.";
      err = -EPERM;
      goto reply_no_propose;
    }
    // Per-release gate: all mons must have the release's mon feature, and
    // (unless --yes-i-really-mean-it) all up OSDs its server feature bit.
    if (rel == ceph_release_t::pacific) {
      if (!mon.monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_PACIFIC)) {
        ss << "not all mons are pacific";
        err = -EPERM;
        goto reply_no_propose;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
           && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
        err = -EPERM;
        goto reply_no_propose;
      }
    } else if (rel == ceph_release_t::quincy) {
      if (!mon.monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_QUINCY)) {
        ss << "not all mons are quincy";
        err = -EPERM;
        goto reply_no_propose;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
           && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
        err = -EPERM;
        goto reply_no_propose;
      }
    } else if (rel == ceph_release_t::reef) {
      if (!mon.monmap->get_required_features().contains_all(
            ceph::features::mon::FEATURE_REEF)) {
        ss << "not all mons are reef";
        err = -EPERM;
        goto reply_no_propose;
      }
      if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_REEF))
           && !sure) {
        ss << "not all up OSDs have CEPH_FEATURE_SERVER_REEF feature";
        err = -EPERM;
        goto reply_no_propose;
      }
    } else {
      // Only the releases enumerated above may be targeted.
      ss << "not supported for this release";
      err = -EPERM;
      goto reply_no_propose;
    }
    if (rel < osdmap.require_osd_release) {
      ss << "require_osd_release cannot be lowered once it has been set";
      err = -EPERM;
      goto reply_no_propose;
    }
    pending_inc.new_require_osd_release = rel;
    goto update;
11765 } else if (prefix == "osd down" ||
11766 prefix == "osd out" ||
11767 prefix == "osd in" ||
11768 prefix == "osd rm" ||
11769 prefix == "osd stop") {
11770
11771 bool any = false;
11772 bool stop = false;
11773 bool verbose = true;
11774 bool definitely_dead = false;
11775
11776 vector<string> idvec;
11777 cmd_getval(cmdmap, "ids", idvec);
11778 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11779 derr << "definitely_dead " << (int)definitely_dead << dendl;
11780 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11781 set<int> osds;
11782
11783 // wildcard?
11784 if (j == 0 &&
11785 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11786 if (prefix == "osd in") {
11787 // touch out osds only
11788 osdmap.get_out_existing_osds(osds);
11789 } else {
11790 osdmap.get_all_osds(osds);
11791 }
11792 stop = true;
11793 verbose = false; // so the output is less noisy.
11794 } else {
11795 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11796 if (osd < 0) {
11797 ss << "invalid osd id" << osd;
11798 err = -EINVAL;
11799 continue;
11800 } else if (!osdmap.exists(osd)) {
11801 ss << "osd." << osd << " does not exist. ";
11802 continue;
11803 }
11804
11805 osds.insert(osd);
11806 }
11807
11808 for (auto &osd : osds) {
11809 if (prefix == "osd down") {
11810 if (osdmap.is_down(osd)) {
11811 if (verbose)
11812 ss << "osd." << osd << " is already down. ";
11813 } else {
11814 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11815 ss << "marked down osd." << osd << ". ";
11816 any = true;
11817 }
11818 if (definitely_dead) {
11819 if (!pending_inc.new_xinfo.count(osd)) {
11820 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11821 }
11822 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11823 any = true;
11824 }
11825 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11826 }
11827 } else if (prefix == "osd out") {
11828 if (osdmap.is_out(osd)) {
11829 if (verbose)
11830 ss << "osd." << osd << " is already out. ";
11831 } else {
11832 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11833 if (osdmap.osd_weight[osd]) {
11834 if (pending_inc.new_xinfo.count(osd) == 0) {
11835 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11836 }
11837 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11838 }
11839 ss << "marked out osd." << osd << ". ";
11840 std::ostringstream msg;
11841 msg << "Client " << op->get_session()->entity_name
11842 << " marked osd." << osd << " out";
11843 if (osdmap.is_up(osd)) {
11844 msg << ", while it was still marked up";
11845 } else {
11846 auto period = ceph_clock_now() - down_pending_out[osd];
11847 msg << ", after it was down for " << int(period.sec())
11848 << " seconds";
11849 }
11850
11851 mon.clog->info() << msg.str();
11852 any = true;
11853 }
11854 } else if (prefix == "osd in") {
11855 if (osdmap.is_in(osd)) {
11856 if (verbose)
11857 ss << "osd." << osd << " is already in. ";
11858 } else {
11859 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11860 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11861 if (pending_inc.new_xinfo.count(osd) == 0) {
11862 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11863 }
11864 pending_inc.new_xinfo[osd].old_weight = 0;
11865 } else {
11866 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11867 }
11868 ss << "marked in osd." << osd << ". ";
11869 any = true;
11870 }
11871 } else if (prefix == "osd rm") {
11872 err = prepare_command_osd_remove(osd);
11873
11874 if (err == -EBUSY) {
11875 if (any)
11876 ss << ", ";
11877 ss << "osd." << osd << " is still up; must be down before removal. ";
11878 } else {
11879 ceph_assert(err == 0);
11880 if (any) {
11881 ss << ", osd." << osd;
11882 } else {
11883 ss << "removed osd." << osd;
11884 }
11885 any = true;
11886 }
11887 } else if (prefix == "osd stop") {
11888 if (osdmap.is_stop(osd)) {
11889 if (verbose)
11890 ss << "osd." << osd << " is already stopped. ";
11891 } else if (osdmap.is_down(osd)) {
11892 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11893 ss << "stop down osd." << osd << ". ";
11894 any = true;
11895 } else {
11896 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11897 ss << "stop osd." << osd << ". ";
11898 any = true;
11899 }
11900 }
11901 }
11902 }
11903 if (any) {
11904 getline(ss, rs);
11905 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11906 get_last_committed() + 1));
11907 return true;
11908 }
11909 } else if (prefix == "osd set-group" ||
11910 prefix == "osd unset-group" ||
11911 prefix == "osd add-noup" ||
11912 prefix == "osd add-nodown" ||
11913 prefix == "osd add-noin" ||
11914 prefix == "osd add-noout" ||
11915 prefix == "osd rm-noup" ||
11916 prefix == "osd rm-nodown" ||
11917 prefix == "osd rm-noin" ||
11918 prefix == "osd rm-noout") {
11919 bool do_set = prefix == "osd set-group" ||
11920 prefix.find("add") != string::npos;
11921 string flag_str;
11922 unsigned flags = 0;
11923 vector<string> who;
11924 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11925 cmd_getval(cmdmap, "flags", flag_str);
11926 cmd_getval(cmdmap, "who", who);
11927 vector<string> raw_flags;
11928 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11929 for (auto& f : raw_flags) {
11930 if (f == "noup")
11931 flags |= CEPH_OSD_NOUP;
11932 else if (f == "nodown")
11933 flags |= CEPH_OSD_NODOWN;
11934 else if (f == "noin")
11935 flags |= CEPH_OSD_NOIN;
11936 else if (f == "noout")
11937 flags |= CEPH_OSD_NOOUT;
11938 else {
11939 ss << "unrecognized flag '" << f << "', must be one of "
11940 << "{noup,nodown,noin,noout}";
11941 err = -EINVAL;
11942 goto reply_no_propose;
11943 }
11944 }
11945 } else {
11946 cmd_getval(cmdmap, "ids", who);
11947 if (prefix.find("noup") != string::npos)
11948 flags = CEPH_OSD_NOUP;
11949 else if (prefix.find("nodown") != string::npos)
11950 flags = CEPH_OSD_NODOWN;
11951 else if (prefix.find("noin") != string::npos)
11952 flags = CEPH_OSD_NOIN;
11953 else if (prefix.find("noout") != string::npos)
11954 flags = CEPH_OSD_NOOUT;
11955 else
11956 ceph_assert(0 == "Unreachable!");
11957 }
11958 if (flags == 0) {
11959 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11960 err = -EINVAL;
11961 goto reply_no_propose;
11962 }
11963 if (who.empty()) {
11964 ss << "must specify at least one or more targets to set/unset";
11965 err = -EINVAL;
11966 goto reply_no_propose;
11967 }
11968 set<int> osds;
11969 set<int> crush_nodes;
11970 set<int> device_classes;
11971 for (auto& w : who) {
11972 if (w == "any" || w == "all" || w == "*") {
11973 osdmap.get_all_osds(osds);
11974 break;
11975 }
11976 std::stringstream ts;
11977 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11978 osds.insert(osd);
11979 } else if (osdmap.crush->name_exists(w)) {
11980 crush_nodes.insert(osdmap.crush->get_item_id(w));
11981 } else if (osdmap.crush->class_exists(w)) {
11982 device_classes.insert(osdmap.crush->get_class_id(w));
11983 } else {
11984 ss << "unable to parse osd id or crush node or device class: "
11985 << "\"" << w << "\". ";
11986 }
11987 }
11988 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11989 // ss has reason for failure
11990 err = -EINVAL;
11991 goto reply_no_propose;
11992 }
11993 bool any = false;
11994 for (auto osd : osds) {
11995 if (!osdmap.exists(osd)) {
11996 ss << "osd." << osd << " does not exist. ";
11997 continue;
11998 }
11999 if (do_set) {
12000 if (flags & CEPH_OSD_NOUP) {
12001 any |= osdmap.is_noup_by_osd(osd) ?
12002 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
12003 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
12004 }
12005 if (flags & CEPH_OSD_NODOWN) {
12006 any |= osdmap.is_nodown_by_osd(osd) ?
12007 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
12008 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
12009 }
12010 if (flags & CEPH_OSD_NOIN) {
12011 any |= osdmap.is_noin_by_osd(osd) ?
12012 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
12013 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
12014 }
12015 if (flags & CEPH_OSD_NOOUT) {
12016 any |= osdmap.is_noout_by_osd(osd) ?
12017 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
12018 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
12019 }
12020 } else {
12021 if (flags & CEPH_OSD_NOUP) {
12022 any |= osdmap.is_noup_by_osd(osd) ?
12023 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
12024 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
12025 }
12026 if (flags & CEPH_OSD_NODOWN) {
12027 any |= osdmap.is_nodown_by_osd(osd) ?
12028 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
12029 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
12030 }
12031 if (flags & CEPH_OSD_NOIN) {
12032 any |= osdmap.is_noin_by_osd(osd) ?
12033 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
12034 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
12035 }
12036 if (flags & CEPH_OSD_NOOUT) {
12037 any |= osdmap.is_noout_by_osd(osd) ?
12038 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
12039 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
12040 }
12041 }
12042 }
12043 for (auto& id : crush_nodes) {
12044 auto old_flags = osdmap.get_crush_node_flags(id);
12045 auto& pending_flags = pending_inc.new_crush_node_flags[id];
12046 pending_flags |= old_flags; // adopt existing flags first!
12047 if (do_set) {
12048 pending_flags |= flags;
12049 } else {
12050 pending_flags &= ~flags;
12051 }
12052 any = true;
12053 }
12054 for (auto& id : device_classes) {
12055 auto old_flags = osdmap.get_device_class_flags(id);
12056 auto& pending_flags = pending_inc.new_device_class_flags[id];
12057 pending_flags |= old_flags;
12058 if (do_set) {
12059 pending_flags |= flags;
12060 } else {
12061 pending_flags &= ~flags;
12062 }
12063 any = true;
12064 }
12065 if (any) {
12066 getline(ss, rs);
12067 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
12068 get_last_committed() + 1));
12069 return true;
12070 }
  } else if (prefix == "osd pg-temp") {
    // Manually set — or, with an empty id list, clear — the pg_temp mapping
    // for a single pg.
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;
    if (pending_inc.new_pg_temp.count(pgid)) {
      // A change for this pg is already queued; retry after it commits.
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      goto wait;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty()) {
      // Empty vector means "remove the pg_temp entry".
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply_no_propose;
      }
      new_pg_temp.push_back(osd);
    }

    // The mapping must satisfy the pool's min_size..size bounds.
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply_no_propose;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply_no_propose;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp" ||
             prefix == "osd rm-primary-temp") {
    // Set (or clear, via osd = -1) a temporary primary override for a pg.
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;

    int64_t osd;
    if (prefix == "osd primary-temp") {
      if (!cmd_getval(cmdmap, "id", osd)) {
        ss << "unable to parse 'id' value '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "'";
        err = -EINVAL;
        goto reply_no_propose;
      }
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply_no_propose;
      }
    }
    else if (prefix == "osd rm-primary-temp") {
      // -1 removes the primary_temp mapping.
      osd = -1;
    }
    else {
      ceph_assert(0 == "Unreachable!");
    }

    // primary_temp requires clients new enough to understand it (firefly+).
    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply_no_propose;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a pg to re-peer by perturbing its pg_temp mapping.
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply_no_propose;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change.  Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        // Candidate must exist, be up, and not already be the primary.
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply_no_propose;
      }
    }
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items" ||
             prefix == "osd pg-upmap-primary" ||
             prefix == "osd rm-pg-upmap-primary") {
    // Manage explicit per-pg mapping overrides:
    //   pg-upmap(-items)  — remap a pg wholesale or via from->to osd pairs
    //   pg-upmap-primary  — override which acting-set member is primary
    // Each flavor is gated on a minimum client compat release and on the
    // cluster-wide OSDMAP_PG_UPMAP feature.
    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
      OP_PG_UPMAP_PRIMARY,
      OP_RM_PG_UPMAP_PRIMARY,
    } upmap_option;

    if (prefix == "osd pg-upmap") {
      upmap_option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      upmap_option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      upmap_option = OP_PG_UPMAP_ITEMS;
    } else if (prefix == "osd rm-pg-upmap-items") {
      upmap_option = OP_RM_PG_UPMAP_ITEMS;
    } else if (prefix == "osd pg-upmap-primary") {
      upmap_option = OP_PG_UPMAP_PRIMARY;
    } else if (prefix == "osd rm-pg-upmap-primary") {
      upmap_option = OP_RM_PG_UPMAP_PRIMARY;
    } else {
      ceph_abort_msg("invalid upmap option");
    }

    // Pick the compat floor: plain upmap needs luminous clients,
    // pg-upmap-primary needs reef.
    ceph_release_t min_release = ceph_release_t::unknown;
    string feature_name = "unknown";
    switch (upmap_option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP: // fall through
    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      min_release = ceph_release_t::luminous;
      feature_name = "pg-upmap";
      break;

    case OP_PG_UPMAP_PRIMARY: // fall through
    case OP_RM_PG_UPMAP_PRIMARY:
      min_release = ceph_release_t::reef;
      feature_name = "pg-upmap-primary";
      break;

    default:
      ceph_abort_msg("invalid upmap option");
    }
    uint64_t min_feature = CEPH_FEATUREMASK_OSDMAP_PG_UPMAP;
    string min_release_name = ceph_release_name(static_cast<int>(min_release));

    if (osdmap.require_min_compat_client < min_release) {
      ss << "min_compat_client "
         << osdmap.require_min_compat_client
         << " < " << min_release_name << ", which is required for " << feature_name << ". "
         << "Try 'ceph osd set-require-min-compat-client " << min_release_name << "' "
         << "before using the new interface";
      err = -EPERM;
      goto reply_no_propose;
    }

    //TODO: Should I add feature and test for upmap-primary?
    err = check_cluster_features(min_feature, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply_no_propose;
    pg_t pgid;
    err = parse_pgid(cmdmap, ss, pgid);
    if (err < 0)
      goto reply_no_propose;
    if (pending_inc.old_pools.count(pgid.pool())) {
      ss << "pool of " << pgid << " is pending removal";
      err = -ENOENT;
      getline(ss, rs);
      wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
      return true;
    }

    // check pending upmap changes
    switch (upmap_option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        goto wait;
      }
      break;

    case OP_PG_UPMAP_PRIMARY: // fall through
    case OP_RM_PG_UPMAP_PRIMARY:
      {
        // pg-upmap-primary only makes sense for replicated pools.
        const pg_pool_t *pt = osdmap.get_pg_pool(pgid.pool());
        if (! pt->is_replicated()) {
          ss << "pg-upmap-primary is only supported for replicated pools";
          err = -EINVAL;
          goto reply_no_propose;
        }
      }
      // fall through
    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS: // fall through
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        goto wait;
      }
      break;

    default:
      ceph_abort_msg("invalid upmap option");
    }

    // Execute the requested operation.
    switch (upmap_option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply_no_propose;
        }

        // The new mapping must respect the pool's min_size..size bounds.
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply_no_propose;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply_no_propose;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply_no_propose;
          }
          // Duplicate osd ids are reported and skipped, not fatal.
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply_no_propose;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply_no_propose;
        }

        // ids arrive as flat from,to,from,to... so the count must be even.
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply_no_propose;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply_no_propose;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;
          int to = *p;
          if (from == to) {
            // Self-mapping is pointless; report and skip the pair.
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply_no_propose;
          }
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply_no_propose;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        out.resize(out.size() - 1); // drop last ','
        out += "]";

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply_no_propose;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;

    case OP_RM_PG_UPMAP_ITEMS:
      {
        pending_inc.old_pg_upmap_items.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_items mapping";
      }
      break;

    case OP_PG_UPMAP_PRIMARY:
      {
        int64_t id;
        if (!cmd_getval(cmdmap, "id", id)) {
          ss << "invalid osd id value '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply_no_propose;
        }
        if (id != CRUSH_ITEM_NONE && !osdmap.exists(id)) {
          ss << "osd." << id << " does not exist";
          err = -ENOENT;
          goto reply_no_propose;
        }
        vector<int> acting;
        int primary;
        osdmap.pg_to_acting_osds(pgid, &acting, &primary);
        if (id == primary) {
          ss << "osd." << id << " is already primary for pg " << pgid;
          err = -EINVAL;
          goto reply_no_propose;
        }
        // The requested osd must be a non-primary member of the acting set.
        int found_idx = 0;
        for (int i = 1 ; i < (int)acting.size(); i++) {  // skip 0 on purpose
          if (acting[i] == id) {
            found_idx = i;
            break;
          }
        }
        if (found_idx == 0) {
          ss << "osd." << id << " is not in acting set for pg " << pgid;
          err = -EINVAL;
          goto reply_no_propose;
        }
        // Swap the requested osd into slot 0 and verify the result is a
        // legal mapping under the pool's crush rule before committing.
        vector<int> new_acting(acting);
        new_acting[found_idx] = new_acting[0];
        new_acting[0] = id;
        int pool_size = osdmap.get_pg_pool_size(pgid);
        if (osdmap.crush->verify_upmap(cct, osdmap.get_pg_pool_crush_rule(pgid),
            pool_size, new_acting) >= 0) {
          ss << "change primary for pg " << pgid << " to osd." << id;
        }
        else {
          ss << "can't change primary for pg " << pgid << " to osd." << id
             << " - illegal pg after the change";
          err = -EINVAL;
          goto reply_no_propose;
        }
        pending_inc.new_pg_upmap_primary[pgid] = id;
        //TO-REMOVE:
        ldout(cct, 20) << "pg " << pgid << ": set pg_upmap_primary to " << id << dendl;
      }
      break;

    case OP_RM_PG_UPMAP_PRIMARY:
      {
        pending_inc.old_pg_upmap_primary.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_primary mapping";
      }
      break;

    default:
      ceph_abort_msg("invalid upmap option");
    }

    goto update;
12519 } else if (prefix == "osd primary-affinity") {
12520 int64_t id;
12521 if (!cmd_getval(cmdmap, "id", id)) {
12522 ss << "invalid osd id value '"
12523 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12524 err = -EINVAL;
12525 goto reply_no_propose;
12526 }
12527 double w;
12528 if (!cmd_getval(cmdmap, "weight", w)) {
12529 ss << "unable to parse 'weight' value '"
12530 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12531 err = -EINVAL;
12532 goto reply_no_propose;
12533 }
12534 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12535 if (ww < 0L) {
12536 ss << "weight must be >= 0";
12537 err = -EINVAL;
12538 goto reply_no_propose;
12539 }
12540 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12541 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12542 ss << "require_min_compat_client "
12543 << osdmap.require_min_compat_client
12544 << " < firefly, which is required for primary-affinity";
12545 err = -EPERM;
12546 goto reply_no_propose;
12547 }
12548 if (osdmap.exists(id)) {
12549 pending_inc.new_primary_affinity[id] = ww;
12550 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12551 getline(ss, rs);
12552 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12553 get_last_committed() + 1));
12554 return true;
12555 } else {
12556 ss << "osd." << id << " does not exist";
12557 err = -ENOENT;
12558 goto reply_no_propose;
12559 }
  } else if (prefix == "osd reweight") {
    // Set an osd's in/out weight, given as a float and stored as a
    // fixed-point fraction of CEPH_OSD_IN.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply_no_propose;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply_no_propose;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply_no_propose;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply_no_propose;
    }
  } else if (prefix == "osd reweightn") {
    // Batch form of "osd reweight": parse an id->weight map and apply all
    // entries in a single proposal.
    map<int32_t, uint32_t> weights;
    err = parse_reweights(cct, cmdmap, osdmap, &weights);
    if (err) {
      ss << "unable to parse 'weights' value '"
         << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
      goto reply_no_propose;
    }
    pending_inc.new_weight.insert(weights.begin(), weights.end());
    wait_for_finished_proposal(
      op,
      new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
    return true;
  } else if (prefix == "osd lost") {
    // Declare a down osd permanently lost, recording the epoch at which it
    // went down.  Destructive, so it demands --yes-i-really-mean-it.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply_no_propose;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "are you SURE? this might mean real, permanent data loss. pass "
         "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply_no_propose;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply_no_propose;
    } else if (!osdmap.is_down(id)) {
      // An up osd cannot be declared lost.
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply_no_propose;
    } else {
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    }
12638
12639 } else if (prefix == "osd destroy-actual" ||
12640 prefix == "osd purge-actual" ||
12641 prefix == "osd purge-new") {
12642 /* Destroying an OSD means that we don't expect to further make use of
12643 * the OSDs data (which may even become unreadable after this operation),
12644 * and that we are okay with scrubbing all its cephx keys and config-key
12645 * data (which may include lockbox keys, thus rendering the osd's data
12646 * unreadable).
12647 *
12648 * The OSD will not be removed. Instead, we will mark it as destroyed,
12649 * such that a subsequent call to `create` will not reuse the osd id.
12650 * This will play into being able to recreate the OSD, at the same
12651 * crush location, with minimal data movement.
12652 */
12653
12654 // make sure authmon is writeable.
12655 if (!mon.authmon()->is_writeable()) {
12656 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12657 << "osd destroy" << dendl;
12658 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12659 return false;
12660 }
12661
12662 int64_t id;
12663 if (!cmd_getval(cmdmap, "id", id)) {
12664 auto p = cmdmap.find("id");
12665 if (p == cmdmap.end()) {
12666 ss << "no osd id specified";
12667 } else {
12668 ss << "unable to parse osd id value '"
12669 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12670 }
12671 err = -EINVAL;
12672 goto reply_no_propose;
12673 }
12674
12675 bool is_destroy = (prefix == "osd destroy-actual");
12676 if (!is_destroy) {
12677 ceph_assert("osd purge-actual" == prefix ||
12678 "osd purge-new" == prefix);
12679 }
12680
12681 bool sure = false;
12682 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12683 if (!sure) {
12684 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12685 << "This will mean real, permanent data loss, as well "
12686 << "as deletion of cephx and lockbox keys. "
12687 << "Pass --yes-i-really-mean-it if you really do.";
12688 err = -EPERM;
12689 goto reply_no_propose;
12690 } else if (!osdmap.exists(id)) {
12691 ss << "osd." << id << " does not exist";
12692 err = 0; // idempotent
12693 goto reply_no_propose;
12694 } else if (osdmap.is_up(id)) {
12695 ss << "osd." << id << " is not `down`.";
12696 err = -EBUSY;
12697 goto reply_no_propose;
12698 } else if (is_destroy && osdmap.is_destroyed(id)) {
12699 ss << "destroyed osd." << id;
12700 err = 0;
12701 goto reply_no_propose;
12702 }
12703
12704 if (prefix == "osd purge-new" &&
12705 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12706 ss << "osd." << id << " is not new";
12707 err = -EPERM;
12708 goto reply_no_propose;
12709 }
12710
12711 bool goto_reply = false;
12712
12713 paxos.plug();
12714 if (is_destroy) {
12715 err = prepare_command_osd_destroy(id, ss);
12716 // we checked above that it should exist.
12717 ceph_assert(err != -ENOENT);
12718 } else {
12719 err = prepare_command_osd_purge(id, ss);
12720 if (err == -ENOENT) {
12721 err = 0;
12722 ss << "osd." << id << " does not exist.";
12723 goto_reply = true;
12724 }
12725 }
12726 paxos.unplug();
12727
12728 if (err < 0 || goto_reply) {
12729 goto reply_no_propose;
12730 }
12731
12732 if (is_destroy) {
12733 ss << "destroyed osd." << id;
12734 } else {
12735 ss << "purged osd." << id;
12736 }
12737
12738 getline(ss, rs);
12739 wait_for_finished_proposal(op,
12740 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12741 force_immediate_propose();
12742 return true;
12743
12744 } else if (prefix == "osd new") {
12745
12746 // make sure authmon is writeable.
12747 if (!mon.authmon()->is_writeable()) {
12748 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12749 << "osd new" << dendl;
12750 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12751 return false;
12752 }
12753
12754 // make sure kvmon is writeable.
12755 if (!mon.kvmon()->is_writeable()) {
12756 dout(10) << __func__ << " waiting for kv mon to be writeable for "
12757 << "osd new" << dendl;
12758 mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12759 return false;
12760 }
12761
12762 map<string,string> param_map;
12763
12764 bufferlist bl = m->get_data();
12765 string param_json = bl.to_str();
12766 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12767
12768 err = get_json_str_map(param_json, ss, &param_map);
12769 if (err < 0)
12770 goto reply_no_propose;
12771
12772 dout(20) << __func__ << " osd new params " << param_map << dendl;
12773
12774 paxos.plug();
12775 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12776 paxos.unplug();
12777
12778 if (err < 0) {
12779 goto reply_no_propose;
12780 }
12781
12782 if (f) {
12783 f->flush(rdata);
12784 } else {
12785 rdata.append(ss);
12786 }
12787
12788 if (err == EEXIST) {
12789 // idempotent operation
12790 err = 0;
12791 goto reply_no_propose;
12792 }
12793
12794 wait_for_finished_proposal(op,
12795 new Monitor::C_Command(mon, op, 0, rs, rdata,
12796 get_last_committed() + 1));
12797 force_immediate_propose();
12798 return true;
12799
12800 } else if (prefix == "osd create") {
12801
12802 // optional id provided?
12803 int64_t id = -1, cmd_id = -1;
12804 if (cmd_getval(cmdmap, "id", cmd_id)) {
12805 if (cmd_id < 0) {
12806 ss << "invalid osd id value '" << cmd_id << "'";
12807 err = -EINVAL;
12808 goto reply_no_propose;
12809 }
12810 dout(10) << " osd create got id " << cmd_id << dendl;
12811 }
12812
12813 uuid_d uuid;
12814 string uuidstr;
12815 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12816 if (!uuid.parse(uuidstr.c_str())) {
12817 ss << "invalid uuid value '" << uuidstr << "'";
12818 err = -EINVAL;
12819 goto reply_no_propose;
12820 }
12821 // we only care about the id if we also have the uuid, to
12822 // ensure the operation's idempotency.
12823 id = cmd_id;
12824 }
12825
12826 int32_t new_id = -1;
12827 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12828 if (err < 0) {
12829 if (err == -EAGAIN) {
12830 goto wait;
12831 }
12832 // a check has failed; reply to the user.
12833 goto reply_no_propose;
12834
12835 } else if (err == EEXIST) {
12836 // this is an idempotent operation; we can go ahead and reply.
12837 if (f) {
12838 f->open_object_section("created_osd");
12839 f->dump_int("osdid", new_id);
12840 f->close_section();
12841 f->flush(rdata);
12842 } else {
12843 ss << new_id;
12844 rdata.append(ss);
12845 }
12846 err = 0;
12847 goto reply_no_propose;
12848 }
12849
12850 string empty_device_class;
12851 do_osd_create(id, uuid, empty_device_class, &new_id);
12852
12853 if (f) {
12854 f->open_object_section("created_osd");
12855 f->dump_int("osdid", new_id);
12856 f->close_section();
12857 f->flush(rdata);
12858 } else {
12859 ss << new_id;
12860 rdata.append(ss);
12861 }
12862 wait_for_finished_proposal(op,
12863 new Monitor::C_Command(mon, op, 0, rs, rdata,
12864 get_last_committed() + 1));
12865 return true;
12866
12867 } else if (prefix == "osd blocklist clear" ||
12868 prefix == "osd blacklist clear") {
12869 pending_inc.new_blocklist.clear();
12870 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12871 std::list<std::pair<entity_addr_t,utime_t > > range_b;
12872 osdmap.get_blocklist(&blocklist, &range_b);
12873 for (const auto &entry : blocklist) {
12874 pending_inc.old_blocklist.push_back(entry.first);
12875 }
12876 for (const auto &entry : range_b) {
12877 pending_inc.old_range_blocklist.push_back(entry.first);
12878 }
12879 ss << " removed all blocklist entries";
12880 getline(ss, rs);
12881 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12882 get_last_committed() + 1));
12883 return true;
12884 } else if (prefix == "osd blocklist" ||
12885 prefix == "osd blacklist") {
12886 string addrstr, rangestr;
12887 bool range = false;
12888 cmd_getval(cmdmap, "addr", addrstr);
12889 if (cmd_getval(cmdmap, "range", rangestr)) {
12890 if (rangestr == "range") {
12891 range = true;
12892 } else {
12893 ss << "Did you mean to specify \"osd blocklist range\"?";
12894 err = -EINVAL;
12895 goto reply_no_propose;
12896 }
12897 }
12898 entity_addr_t addr;
12899 if (!addr.parse(addrstr)) {
12900 ss << "unable to parse address " << addrstr;
12901 err = -EINVAL;
12902 goto reply_no_propose;
12903 }
12904 else {
12905 if (range) {
12906 if (!addr.maybe_cidr()) {
12907 ss << "You specified a range command, but " << addr
12908 << " does not parse as a CIDR range";
12909 err = -EINVAL;
12910 goto reply_no_propose;
12911 }
12912 addr.type = entity_addr_t::TYPE_CIDR;
12913 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12914 if (err) {
12915 goto reply_no_propose;
12916 }
12917 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12918 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12919 ss << "Too many bits in range for that protocol!";
12920 err = -EINVAL;
12921 goto reply_no_propose;
12922 }
12923 } else {
12924 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12925 // always blocklist type ANY
12926 addr.set_type(entity_addr_t::TYPE_ANY);
12927 } else {
12928 addr.set_type(entity_addr_t::TYPE_LEGACY);
12929 }
12930 }
12931
12932 string blocklistop;
12933 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12934 cmd_getval(cmdmap, "blacklistop", blocklistop);
12935 }
12936 if (blocklistop == "add") {
12937 utime_t expires = ceph_clock_now();
12938 // default one hour
12939 double d = cmd_getval_or<double>(cmdmap, "expire",
12940 g_conf()->mon_osd_blocklist_default_expire);
12941 expires += d;
12942
12943 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12944 const auto& addr,
12945 const auto& expires) {
12946 nb[addr] = expires;
12947 // cancel any pending un-blocklisting request too
12948 auto it = std::find(ob.begin(),
12949 ob.end(), addr);
12950 if (it != ob.end()) {
12951 ob.erase(it);
12952 }
12953 };
12954 if (range) {
12955 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12956 pending_inc.old_range_blocklist,
12957 addr, expires);
12958
12959 } else {
12960 add_to_pending_blocklists(pending_inc.new_blocklist,
12961 pending_inc.old_blocklist,
12962 addr, expires);
12963 }
12964
12965 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12966 getline(ss, rs);
12967 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12968 get_last_committed() + 1));
12969 return true;
12970 } else if (blocklistop == "rm") {
12971 auto rm_from_pending_blocklists = [](const auto& addr,
12972 auto& blocklist,
12973 auto& ob, auto& pb) {
12974 if (blocklist.count(addr)) {
12975 ob.push_back(addr);
12976 return true;
12977 } else if (pb.count(addr)) {
12978 pb.erase(addr);
12979 return true;
12980 }
12981 return false;
12982 };
12983 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12984 pending_inc.old_blocklist,
12985 pending_inc.new_blocklist)) ||
12986 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12987 pending_inc.old_range_blocklist,
12988 pending_inc.new_range_blocklist))) {
12989 ss << "un-blocklisting " << addr;
12990 getline(ss, rs);
12991 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12992 get_last_committed() + 1));
12993 return true;
12994 }
12995 ss << addr << " isn't blocklisted";
12996 err = 0;
12997 goto reply_no_propose;
12998 }
12999 }
13000 } else if (prefix == "osd pool mksnap") {
13001 string poolstr;
13002 cmd_getval(cmdmap, "pool", poolstr);
13003 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13004 if (pool < 0) {
13005 ss << "unrecognized pool '" << poolstr << "'";
13006 err = -ENOENT;
13007 goto reply_no_propose;
13008 }
13009 string snapname;
13010 cmd_getval(cmdmap, "snap", snapname);
13011 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13012 if (p->is_unmanaged_snaps_mode()) {
13013 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13014 err = -EINVAL;
13015 goto reply_no_propose;
13016 } else if (p->snap_exists(snapname.c_str())) {
13017 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13018 err = 0;
13019 goto reply_no_propose;
13020 } else if (p->is_tier()) {
13021 ss << "pool " << poolstr << " is a cache tier";
13022 err = -EINVAL;
13023 goto reply_no_propose;
13024 }
13025 pg_pool_t *pp = 0;
13026 if (pending_inc.new_pools.count(pool))
13027 pp = &pending_inc.new_pools[pool];
13028 if (!pp) {
13029 pp = &pending_inc.new_pools[pool];
13030 *pp = *p;
13031 }
13032 if (pp->snap_exists(snapname.c_str())) {
13033 ss << "pool " << poolstr << " snap " << snapname << " already exists";
13034 } else {
13035 if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(pool)) {
13036 dout(20) << "pool-level snapshots have been disabled for pools "
13037 "attached to an fs - poolid:" << pool << dendl;
13038 err = -EOPNOTSUPP;
13039 goto reply_no_propose;
13040 }
13041 pp->add_snap(snapname.c_str(), ceph_clock_now());
13042 pp->set_snap_epoch(pending_inc.epoch);
13043 ss << "created pool " << poolstr << " snap " << snapname;
13044 }
13045 getline(ss, rs);
13046 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13047 get_last_committed() + 1));
13048 return true;
13049 } else if (prefix == "osd pool rmsnap") {
13050 string poolstr;
13051 cmd_getval(cmdmap, "pool", poolstr);
13052 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13053 if (pool < 0) {
13054 ss << "unrecognized pool '" << poolstr << "'";
13055 err = -ENOENT;
13056 goto reply_no_propose;
13057 }
13058 string snapname;
13059 cmd_getval(cmdmap, "snap", snapname);
13060 const pg_pool_t *p = osdmap.get_pg_pool(pool);
13061 if (p->is_unmanaged_snaps_mode()) {
13062 ss << "pool " << poolstr << " is in unmanaged snaps mode";
13063 err = -EINVAL;
13064 goto reply_no_propose;
13065 } else if (!p->snap_exists(snapname.c_str())) {
13066 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
13067 err = 0;
13068 goto reply_no_propose;
13069 }
13070 pg_pool_t *pp = 0;
13071 if (pending_inc.new_pools.count(pool))
13072 pp = &pending_inc.new_pools[pool];
13073 if (!pp) {
13074 pp = &pending_inc.new_pools[pool];
13075 *pp = *p;
13076 }
13077 snapid_t sn = pp->snap_exists(snapname.c_str());
13078 if (sn) {
13079 pp->remove_snap(sn);
13080 pp->set_snap_epoch(pending_inc.epoch);
13081 ss << "removed pool " << poolstr << " snap " << snapname;
13082 } else {
13083 ss << "already removed pool " << poolstr << " snap " << snapname;
13084 }
13085 getline(ss, rs);
13086 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13087 get_last_committed() + 1));
13088 return true;
13089 } else if (prefix == "osd pool create") {
13090 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
13091 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
13092 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
13093 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
13094 string pool_type_str;
13095 cmd_getval(cmdmap, "pool_type", pool_type_str);
13096 if (pool_type_str.empty())
13097 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
13098
13099 string poolstr;
13100 cmd_getval(cmdmap, "pool", poolstr);
13101 bool confirm = false;
13102 //confirmation may be set to true only by internal operations.
13103 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13104 if (poolstr[0] == '.' && !confirm) {
13105 ss << "pool names beginning with . are not allowed";
13106 err = 0;
13107 goto reply_no_propose;
13108 }
13109 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13110 if (pool_id >= 0) {
13111 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13112 if (pool_type_str != p->get_type_name()) {
13113 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
13114 err = -EINVAL;
13115 } else {
13116 ss << "pool '" << poolstr << "' already exists";
13117 err = 0;
13118 }
13119 goto reply_no_propose;
13120 }
13121
13122 int pool_type;
13123 if (pool_type_str == "replicated") {
13124 pool_type = pg_pool_t::TYPE_REPLICATED;
13125 } else if (pool_type_str == "erasure") {
13126 pool_type = pg_pool_t::TYPE_ERASURE;
13127 } else {
13128 ss << "unknown pool type '" << pool_type_str << "'";
13129 err = -EINVAL;
13130 goto reply_no_propose;
13131 }
13132
13133 bool implicit_rule_creation = false;
13134 int64_t expected_num_objects = 0;
13135 string rule_name;
13136 cmd_getval(cmdmap, "rule", rule_name);
13137 string erasure_code_profile;
13138 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
13139
13140 if (pool_type == pg_pool_t::TYPE_ERASURE) {
13141 if (erasure_code_profile == "")
13142 erasure_code_profile = "default";
13143 //handle the erasure code profile
13144 if (erasure_code_profile == "default") {
13145 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
13146 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
13147 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
13148 goto wait;
13149 }
13150
13151 map<string,string> profile_map;
13152 err = osdmap.get_erasure_code_profile_default(cct,
13153 profile_map,
13154 &ss);
13155 if (err)
13156 goto reply_no_propose;
13157 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
13158 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
13159 goto wait;
13160 }
13161 }
13162 if (rule_name == "") {
13163 implicit_rule_creation = true;
13164 if (erasure_code_profile == "default") {
13165 rule_name = "erasure-code";
13166 } else {
13167 dout(1) << "implicitly use rule named after the pool: "
13168 << poolstr << dendl;
13169 rule_name = poolstr;
13170 }
13171 }
13172 expected_num_objects =
13173 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13174 } else {
13175 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13176 // and put expected_num_objects to rule field
13177 if (erasure_code_profile != "") { // cmd is from CLI
13178 if (rule_name != "") {
13179 string interr;
13180 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13181 if (interr.length()) {
13182 ss << "error parsing integer value '" << rule_name << "': " << interr;
13183 err = -EINVAL;
13184 goto reply_no_propose;
13185 }
13186 }
13187 rule_name = erasure_code_profile;
13188 } else { // cmd is well-formed
13189 expected_num_objects =
13190 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13191 }
13192 }
13193
13194 if (!implicit_rule_creation && rule_name != "") {
13195 int rule;
13196 err = get_crush_rule(rule_name, &rule, &ss);
13197 if (err == -EAGAIN) {
13198 goto wait;
13199 }
13200 if (err)
13201 goto reply_no_propose;
13202 }
13203
13204 if (expected_num_objects < 0) {
13205 ss << "'expected_num_objects' must be non-negative";
13206 err = -EINVAL;
13207 goto reply_no_propose;
13208 }
13209
13210 set<int32_t> osds;
13211 osdmap.get_all_osds(osds);
13212 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13213 string type;
13214 if (!get_osd_objectstore_type(osd, &type)) {
13215 return type == "filestore";
13216 } else {
13217 return false;
13218 }
13219 });
13220
13221 if (has_filestore_osd &&
13222 expected_num_objects > 0 &&
13223 cct->_conf->filestore_merge_threshold > 0) {
13224 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13225 err = -EINVAL;
13226 goto reply_no_propose;
13227 }
13228
13229 if (has_filestore_osd &&
13230 expected_num_objects == 0 &&
13231 cct->_conf->filestore_merge_threshold < 0) {
13232 int osds = osdmap.get_num_osds();
13233 bool sure = false;
13234 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13235 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
13236 ss << "For better initial performance on pools expected to store a "
13237 << "large number of objects, consider supplying the "
13238 << "expected_num_objects parameter when creating the pool."
13239 << " Pass --yes-i-really-mean-it to ignore it";
13240 err = -EPERM;
13241 goto reply_no_propose;
13242 }
13243 }
13244
13245 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
13246 FastReadType fast_read = FAST_READ_DEFAULT;
13247 if (fast_read_param == 0)
13248 fast_read = FAST_READ_OFF;
13249 else if (fast_read_param > 0)
13250 fast_read = FAST_READ_ON;
13251
13252 int64_t repl_size = 0;
13253 cmd_getval(cmdmap, "size", repl_size);
13254 int64_t target_size_bytes = 0;
13255 double target_size_ratio = 0.0;
13256 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13257 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13258
13259 string pg_autoscale_mode;
13260 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
13261
13262 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
13263
13264 bool crimson = cmd_getval_or<bool>(cmdmap, "crimson", false) ||
13265 cct->_conf.get_val<bool>("osd_pool_default_crimson");
13266
13267 err = prepare_new_pool(poolstr,
13268 -1, // default crush rule
13269 rule_name,
13270 pg_num, pgp_num, pg_num_min, pg_num_max,
13271 repl_size, target_size_bytes, target_size_ratio,
13272 erasure_code_profile, pool_type,
13273 (uint64_t)expected_num_objects,
13274 fast_read,
13275 pg_autoscale_mode,
13276 bulk,
13277 crimson,
13278 &ss);
13279 if (err < 0) {
13280 switch(err) {
13281 case -EEXIST:
13282 ss << "pool '" << poolstr << "' already exists";
13283 err = 0;
13284 goto reply_no_propose;
13285 case -EAGAIN:
13286 goto wait;
13287 case -ERANGE:
13288 goto reply_no_propose;
13289 default:
13290 goto reply_no_propose;
13291 }
13292 } else {
13293 ss << "pool '" << poolstr << "' created";
13294 }
13295 getline(ss, rs);
13296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13297 get_last_committed() + 1));
13298 return true;
13299
13300 } else if (prefix == "osd pool delete" ||
13301 prefix == "osd pool rm") {
13302 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13303 string poolstr, poolstr2, sure;
13304 cmd_getval(cmdmap, "pool", poolstr);
13305 cmd_getval(cmdmap, "pool2", poolstr2);
13306 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13307 if (pool < 0) {
13308 ss << "pool '" << poolstr << "' does not exist";
13309 err = 0;
13310 goto reply_no_propose;
13311 }
13312
13313 bool force_no_fake = false;
13314 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13315 bool force = false;
13316 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13317 if (poolstr2 != poolstr ||
13318 (!force && !force_no_fake)) {
13319 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13320 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13321 << "followed by --yes-i-really-really-mean-it.";
13322 err = -EPERM;
13323 goto reply_no_propose;
13324 }
13325 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13326 if (err == -EAGAIN) {
13327 goto wait;
13328 }
13329 if (err < 0)
13330 goto reply_no_propose;
13331 goto update;
13332 } else if (prefix == "osd pool rename") {
13333 string srcpoolstr, destpoolstr;
13334 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13335 cmd_getval(cmdmap, "destpool", destpoolstr);
13336 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13337 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13338 bool confirm = false;
13339 //confirmation may be set to true only by internal operations.
13340 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
13341 if (destpoolstr[0] == '.' && !confirm) {
13342 ss << "pool names beginning with . are not allowed";
13343 err = 0;
13344 goto reply_no_propose;
13345 }
13346 if (pool_src < 0) {
13347 if (pool_dst >= 0) {
13348 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13349 // of operations, assume this rename succeeded, as it is not changing
13350 // the current state. Make sure we output something understandable
13351 // for whoever is issuing the command, if they are paying attention,
13352 // in case it was not intentional; or to avoid a "wtf?" and a bug
13353 // report in case it was intentional, while expecting a failure.
13354 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13355 << destpoolstr << "' does -- assuming successful rename";
13356 err = 0;
13357 } else {
13358 ss << "unrecognized pool '" << srcpoolstr << "'";
13359 err = -ENOENT;
13360 }
13361 goto reply_no_propose;
13362 } else if (pool_dst >= 0) {
13363 // source pool exists and so does the destination pool
13364 ss << "pool '" << destpoolstr << "' already exists";
13365 err = -EEXIST;
13366 goto reply_no_propose;
13367 }
13368
13369 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13370 if (ret == 0) {
13371 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13372 } else {
13373 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13374 << cpp_strerror(ret);
13375 }
13376 getline(ss, rs);
13377 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13378 get_last_committed() + 1));
13379 return true;
13380
13381 } else if (prefix == "osd pool set") {
13382 err = prepare_command_pool_set(cmdmap, ss);
13383 if (err == -EAGAIN)
13384 goto wait;
13385 if (err < 0)
13386 goto reply_no_propose;
13387
13388 getline(ss, rs);
13389 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13390 get_last_committed() + 1));
13391 return true;
13392 } else if (prefix == "osd tier add") {
13393 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13394 if (err == -EAGAIN)
13395 goto wait;
13396 if (err)
13397 goto reply_no_propose;
13398 string poolstr;
13399 cmd_getval(cmdmap, "pool", poolstr);
13400 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13401 if (pool_id < 0) {
13402 ss << "unrecognized pool '" << poolstr << "'";
13403 err = -ENOENT;
13404 goto reply_no_propose;
13405 }
13406 string tierpoolstr;
13407 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13408 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13409 if (tierpool_id < 0) {
13410 ss << "unrecognized pool '" << tierpoolstr << "'";
13411 err = -ENOENT;
13412 goto reply_no_propose;
13413 }
13414 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13415 ceph_assert(p);
13416 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13417 ceph_assert(tp);
13418
13419 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13420 goto reply_no_propose;
13421 }
13422
13423 // make sure new tier is empty
13424 bool force_nonempty = false;
13425 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13426 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13427 if (pstats && pstats->stats.sum.num_objects != 0 &&
13428 !force_nonempty) {
13429 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13430 err = -ENOTEMPTY;
13431 goto reply_no_propose;
13432 }
13433 if (tp->is_erasure()) {
13434 ss << "tier pool '" << tierpoolstr
13435 << "' is an ec pool, which cannot be a tier";
13436 err = -ENOTSUP;
13437 goto reply_no_propose;
13438 }
13439 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13440 (!force_nonempty ||
13441 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13442 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13443 err = -ENOTEMPTY;
13444 goto reply_no_propose;
13445 }
13446 // go
13447 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13448 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13449 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13450 goto wait;
13451 }
13452 np->tiers.insert(tierpool_id);
13453 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13454 ntp->tier_of = pool_id;
13455 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13456 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13457 get_last_committed() + 1));
13458 return true;
13459 } else if (prefix == "osd tier remove" ||
13460 prefix == "osd tier rm") {
13461 string poolstr;
13462 cmd_getval(cmdmap, "pool", poolstr);
13463 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13464 if (pool_id < 0) {
13465 ss << "unrecognized pool '" << poolstr << "'";
13466 err = -ENOENT;
13467 goto reply_no_propose;
13468 }
13469 string tierpoolstr;
13470 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13471 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13472 if (tierpool_id < 0) {
13473 ss << "unrecognized pool '" << tierpoolstr << "'";
13474 err = -ENOENT;
13475 goto reply_no_propose;
13476 }
13477 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13478 ceph_assert(p);
13479 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13480 ceph_assert(tp);
13481
13482 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13483 goto reply_no_propose;
13484 }
13485
13486 if (p->tiers.count(tierpool_id) == 0) {
13487 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13488 err = 0;
13489 goto reply_no_propose;
13490 }
13491 if (tp->tier_of != pool_id) {
13492 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13493 << osdmap.get_pool_name(tp->tier_of) << "': "
13494 // be scary about it; this is an inconsistency and bells must go off
13495 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13496 err = -EINVAL;
13497 goto reply_no_propose;
13498 }
13499 if (p->read_tier == tierpool_id) {
13500 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13501 err = -EBUSY;
13502 goto reply_no_propose;
13503 }
13504 // go
13505 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13506 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13507 if (np->tiers.count(tierpool_id) == 0 ||
13508 ntp->tier_of != pool_id ||
13509 np->read_tier == tierpool_id) {
13510 goto wait;
13511 }
13512 np->tiers.erase(tierpool_id);
13513 ntp->clear_tier();
13514 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13515 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13516 get_last_committed() + 1));
13517 return true;
13518 } else if (prefix == "osd tier set-overlay") {
13519 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13520 if (err == -EAGAIN)
13521 goto wait;
13522 if (err)
13523 goto reply_no_propose;
13524 string poolstr;
13525 cmd_getval(cmdmap, "pool", poolstr);
13526 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13527 if (pool_id < 0) {
13528 ss << "unrecognized pool '" << poolstr << "'";
13529 err = -ENOENT;
13530 goto reply_no_propose;
13531 }
13532 string overlaypoolstr;
13533 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13534 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13535 if (overlaypool_id < 0) {
13536 ss << "unrecognized pool '" << overlaypoolstr << "'";
13537 err = -ENOENT;
13538 goto reply_no_propose;
13539 }
13540 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13541 ceph_assert(p);
13542 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13543 ceph_assert(overlay_p);
13544 if (p->tiers.count(overlaypool_id) == 0) {
13545 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13546 err = -EINVAL;
13547 goto reply_no_propose;
13548 }
13549 if (p->read_tier == overlaypool_id) {
13550 err = 0;
13551 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13552 goto reply_no_propose;
13553 }
13554 if (p->has_read_tier()) {
13555 ss << "pool '" << poolstr << "' has overlay '"
13556 << osdmap.get_pool_name(p->read_tier)
13557 << "'; please remove-overlay first";
13558 err = -EINVAL;
13559 goto reply_no_propose;
13560 }
13561
13562 // go
13563 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13564 np->read_tier = overlaypool_id;
13565 np->write_tier = overlaypool_id;
13566 np->set_last_force_op_resend(pending_inc.epoch);
13567 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13568 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13569 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13570 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13571 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13572 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13573 get_last_committed() + 1));
13574 return true;
13575 } else if (prefix == "osd tier remove-overlay" ||
13576 prefix == "osd tier rm-overlay") {
13577 string poolstr;
13578 cmd_getval(cmdmap, "pool", poolstr);
13579 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13580 if (pool_id < 0) {
13581 ss << "unrecognized pool '" << poolstr << "'";
13582 err = -ENOENT;
13583 goto reply_no_propose;
13584 }
13585 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13586 ceph_assert(p);
13587 if (!p->has_read_tier()) {
13588 err = 0;
13589 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13590 goto reply_no_propose;
13591 }
13592
13593 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13594 goto reply_no_propose;
13595 }
13596
13597 // go
13598 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13599 if (np->has_read_tier()) {
13600 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13601 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13602 nop->set_last_force_op_resend(pending_inc.epoch);
13603 }
13604 if (np->has_write_tier()) {
13605 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13606 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13607 nop->set_last_force_op_resend(pending_inc.epoch);
13608 }
13609 np->clear_read_tier();
13610 np->clear_write_tier();
13611 np->set_last_force_op_resend(pending_inc.epoch);
13612 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13613 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13614 get_last_committed() + 1));
13615 return true;
13616 } else if (prefix == "osd tier cache-mode") {
13617 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13618 if (err == -EAGAIN)
13619 goto wait;
13620 if (err)
13621 goto reply_no_propose;
13622 string poolstr;
13623 cmd_getval(cmdmap, "pool", poolstr);
13624 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13625 if (pool_id < 0) {
13626 ss << "unrecognized pool '" << poolstr << "'";
13627 err = -ENOENT;
13628 goto reply_no_propose;
13629 }
13630 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13631 ceph_assert(p);
13632 if (!p->is_tier()) {
13633 ss << "pool '" << poolstr << "' is not a tier";
13634 err = -EINVAL;
13635 goto reply_no_propose;
13636 }
13637 string modestr;
13638 cmd_getval(cmdmap, "mode", modestr);
13639 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13640 if (int(mode) < 0) {
13641 ss << "'" << modestr << "' is not a valid cache mode";
13642 err = -EINVAL;
13643 goto reply_no_propose;
13644 }
13645
13646 bool sure = false;
13647 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13648
13649 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13650 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13651 ss << "'" << modestr << "' is no longer a supported cache mode";
13652 err = -EPERM;
13653 goto reply_no_propose;
13654 }
13655 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13656 mode != pg_pool_t::CACHEMODE_NONE &&
13657 mode != pg_pool_t::CACHEMODE_PROXY &&
13658 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13659 !sure) {
13660 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13661 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13662 err = -EPERM;
13663 goto reply_no_propose;
13664 }
13665
13666 // pool already has this cache-mode set and there are no pending changes
13667 if (p->cache_mode == mode &&
13668 (pending_inc.new_pools.count(pool_id) == 0 ||
13669 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13670 ss << "set cache-mode for pool '" << poolstr << "'"
13671 << " to " << pg_pool_t::get_cache_mode_name(mode);
13672 err = 0;
13673 goto reply_no_propose;
13674 }
13675
13676 /* Mode description:
13677 *
13678 * none: No cache-mode defined
13679 * forward: Forward all reads and writes to base pool [removed]
13680 * writeback: Cache writes, promote reads from base pool
13681 * readonly: Forward writes to base pool
13682 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13683 * proxy: Proxy all reads and writes to base pool
13684 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13685 *
13686 * Hence, these are the allowed transitions:
13687 *
13688 * none -> any
13689 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13690 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13691 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13692 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13693 * writeback -> readproxy || proxy
13694 * readonly -> any
13695 */
13696
13697 // We check if the transition is valid against the current pool mode, as
13698 // it is the only committed state thus far. We will blantly squash
13699 // whatever mode is on the pending state.
13700
13701 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13702 (mode != pg_pool_t::CACHEMODE_PROXY &&
13703 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13704 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13705 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13706 << "' pool; only '"
13707 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_PROXY)
13708 << "','"
13709 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13710 << "' allowed.";
13711 err = -EINVAL;
13712 goto reply_no_propose;
13713 }
13714 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13715 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13716 mode != pg_pool_t::CACHEMODE_PROXY &&
13717 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13718
13719 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13720 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13721 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13722
13723 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13724 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13725 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13726
13727 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13728 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13729 mode != pg_pool_t::CACHEMODE_PROXY &&
13730 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13731
13732 const pool_stat_t* pstats =
13733 mon.mgrstatmon()->get_pool_stat(pool_id);
13734
13735 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13736 ss << "unable to set cache-mode '"
13737 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13738 << "': dirty objects found";
13739 err = -EBUSY;
13740 goto reply_no_propose;
13741 }
13742 }
13743 // go
13744 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13745 np->cache_mode = mode;
13746 // set this both when moving to and from cache_mode NONE. this is to
13747 // capture legacy pools that were set up before this flag existed.
13748 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13749 ss << "set cache-mode for pool '" << poolstr
13750 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13751 if (mode == pg_pool_t::CACHEMODE_NONE) {
13752 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13753 ceph_assert(base_pool);
13754 if (base_pool->read_tier == pool_id ||
13755 base_pool->write_tier == pool_id)
13756 ss <<" (WARNING: pool is still configured as read or write tier)";
13757 }
13758 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13759 get_last_committed() + 1));
13760 return true;
13761 } else if (prefix == "osd tier add-cache") {
13762 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13763 if (err == -EAGAIN)
13764 goto wait;
13765 if (err)
13766 goto reply_no_propose;
13767 string poolstr;
13768 cmd_getval(cmdmap, "pool", poolstr);
13769 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13770 if (pool_id < 0) {
13771 ss << "unrecognized pool '" << poolstr << "'";
13772 err = -ENOENT;
13773 goto reply_no_propose;
13774 }
13775 string tierpoolstr;
13776 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13777 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13778 if (tierpool_id < 0) {
13779 ss << "unrecognized pool '" << tierpoolstr << "'";
13780 err = -ENOENT;
13781 goto reply_no_propose;
13782 }
13783 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13784 ceph_assert(p);
13785 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13786 ceph_assert(tp);
13787
13788 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13789 goto reply_no_propose;
13790 }
13791
13792 int64_t size = 0;
13793 if (!cmd_getval(cmdmap, "size", size)) {
13794 ss << "unable to parse 'size' value '"
13795 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13796 err = -EINVAL;
13797 goto reply_no_propose;
13798 }
13799 // make sure new tier is empty
13800 const pool_stat_t *pstats =
13801 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13802 if (pstats && pstats->stats.sum.num_objects != 0) {
13803 ss << "tier pool '" << tierpoolstr << "' is not empty";
13804 err = -ENOTEMPTY;
13805 goto reply_no_propose;
13806 }
13807 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13808 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13809 if (int(mode) < 0) {
13810 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13811 err = -EINVAL;
13812 goto reply_no_propose;
13813 }
13814 HitSet::Params hsp;
13815 auto& cache_hit_set_type =
13816 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13817 if (cache_hit_set_type == "bloom") {
13818 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13819 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13820 hsp = HitSet::Params(bsp);
13821 } else if (cache_hit_set_type == "explicit_hash") {
13822 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13823 } else if (cache_hit_set_type == "explicit_object") {
13824 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13825 } else {
13826 ss << "osd tier cache default hit set type '"
13827 << cache_hit_set_type << "' is not a known type";
13828 err = -EINVAL;
13829 goto reply_no_propose;
13830 }
13831 // go
13832 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13833 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13834 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13835 goto wait;
13836 }
13837 np->tiers.insert(tierpool_id);
13838 np->read_tier = np->write_tier = tierpool_id;
13839 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13840 np->set_last_force_op_resend(pending_inc.epoch);
13841 ntp->set_last_force_op_resend(pending_inc.epoch);
13842 ntp->tier_of = pool_id;
13843 ntp->cache_mode = mode;
13844 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13845 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13846 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13847 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13848 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13849 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13850 ntp->hit_set_params = hsp;
13851 ntp->target_max_bytes = size;
13852 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13853 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13854 get_last_committed() + 1));
13855 return true;
13856 } else if (prefix == "osd pool set-quota") {
13857 string poolstr;
13858 cmd_getval(cmdmap, "pool", poolstr);
13859 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13860 if (pool_id < 0) {
13861 ss << "unrecognized pool '" << poolstr << "'";
13862 err = -ENOENT;
13863 goto reply_no_propose;
13864 }
13865
13866 string field;
13867 cmd_getval(cmdmap, "field", field);
13868 if (field != "max_objects" && field != "max_bytes") {
13869 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13870 err = -EINVAL;
13871 goto reply_no_propose;
13872 }
13873
13874 // val could contain unit designations, so we treat as a string
13875 string val;
13876 cmd_getval(cmdmap, "val", val);
13877 string tss;
13878 int64_t value;
13879 if (field == "max_objects") {
13880 value = strict_si_cast<uint64_t>(val, &tss);
13881 } else if (field == "max_bytes") {
13882 value = strict_iecstrtoll(val, &tss);
13883 } else {
13884 ceph_abort_msg("unrecognized option");
13885 }
13886 if (!tss.empty()) {
13887 ss << "error parsing value '" << val << "': " << tss;
13888 err = -EINVAL;
13889 goto reply_no_propose;
13890 }
13891
13892 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13893 if (field == "max_objects") {
13894 pi->quota_max_objects = value;
13895 } else if (field == "max_bytes") {
13896 pi->quota_max_bytes = value;
13897 } else {
13898 ceph_abort_msg("unrecognized option");
13899 }
13900 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13901 rs = ss.str();
13902 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13903 get_last_committed() + 1));
13904 return true;
13905 } else if (prefix == "osd pool application enable" ||
13906 prefix == "osd pool application disable" ||
13907 prefix == "osd pool application set" ||
13908 prefix == "osd pool application rm") {
13909 err = prepare_command_pool_application(prefix, cmdmap, ss);
13910 if (err == -EAGAIN) {
13911 goto wait;
13912 } else if (err < 0) {
13913 goto reply_no_propose;
13914 } else {
13915 goto update;
13916 }
13917 } else if (prefix == "osd force-create-pg") {
13918 pg_t pgid;
13919 string pgidstr;
13920 err = parse_pgid(cmdmap, ss, pgid, pgidstr);
13921 if (err < 0)
13922 goto reply_no_propose;
13923 bool sure = false;
13924 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13925 if (!sure) {
13926 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13927 << "that the cluster will give up ever trying to recover the lost data. Do this "
13928 << "only if you are certain that all copies of the PG are in fact lost and you are "
13929 << "willing to accept that the data is permanently destroyed. Pass "
13930 << "--yes-i-really-mean-it to proceed.";
13931 err = -EPERM;
13932 goto reply_no_propose;
13933 }
13934 bool creating_now;
13935 {
13936 std::lock_guard<std::mutex> l(creating_pgs_lock);
13937 auto emplaced = creating_pgs.pgs.emplace(
13938 pgid,
13939 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13940 ceph_clock_now()));
13941 creating_now = emplaced.second;
13942 }
13943 if (creating_now) {
13944 ss << "pg " << pgidstr << " now creating, ok";
13945 // set the pool's CREATING flag so that (1) the osd won't ignore our
13946 // create message and (2) we won't propose any future pg_num changes
13947 // until after the PG has been instantiated.
13948 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13949 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13950 }
13951 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13952 err = 0;
13953 goto update;
13954 } else {
13955 ss << "pg " << pgid << " already creating";
13956 err = 0;
13957 goto reply_no_propose;
13958 }
13959 } else if (prefix == "osd force_healthy_stretch_mode") {
13960 bool sure = false;
13961 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13962 if (!sure) {
13963 ss << "This command will require peering across multiple CRUSH buckets "
13964 "(probably two data centers or availability zones?) and may result in PGs "
13965 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13966 err = -EPERM;
13967 goto reply_no_propose;
13968 }
13969 try_end_recovery_stretch_mode(true);
13970 ss << "Triggering healthy stretch mode";
13971 err = 0;
13972 goto reply_no_propose;
13973 } else if (prefix == "osd force_recovery_stretch_mode") {
13974 bool sure = false;
13975 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13976 if (!sure) {
13977 ss << "This command will increase pool sizes to try and spread them "
13978 "across multiple CRUSH buckets (probably two data centers or "
13979 "availability zones?) and should have happened automatically"
13980 "Pass --yes-i-really-mean-it to proceed.";
13981 err = -EPERM;
13982 goto reply_no_propose;
13983 }
13984 mon.go_recovery_stretch_mode();
13985 ss << "Triggering recovery stretch mode";
13986 err = 0;
13987 goto reply_no_propose;
13988 } else if (prefix == "osd set-allow-crimson") {
13989
13990 bool sure = false;
13991 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13992
13993 bool experimental_enabled =
13994 g_ceph_context->check_experimental_feature_enabled("crimson");
13995 if (!sure || !experimental_enabled) {
13996 ss << "This command will allow usage of crimson-osd osd daemons. "
13997 << "crimson-osd is not considered stable and will likely cause "
13998 << "crashes or data corruption. At this time, crimson-osd is mainly "
13999 << "useful for performance evaluation, testing, and development. "
14000 << "If you are sure, add --yes-i-really-mean-it and add 'crimson' to "
14001 << "the experimental features config. This setting is irrevocable.";
14002 err = -EPERM;
14003 goto reply_no_propose;
14004 }
14005
14006 err = 0;
14007 if (osdmap.get_allow_crimson()) {
14008 goto reply_no_propose;
14009 } else {
14010 pending_inc.set_allow_crimson();
14011 goto update;
14012 }
14013 } else {
14014 err = -EINVAL;
14015 }
14016
14017 reply_no_propose:
14018 getline(ss, rs);
14019 if (err < 0 && rs.length() == 0)
14020 rs = cpp_strerror(err);
14021 mon.reply_command(op, err, rs, rdata, get_last_committed());
14022 return false; /* nothing to propose */
14023
14024 update:
14025 getline(ss, rs);
14026 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
14027 get_last_committed() + 1));
14028 return true;
14029
14030 wait:
14031 // XXX
14032 // Some osd commands split changes across two epochs.
14033 // It seems this is mostly for crush rule changes. It doesn't need
14034 // to be this way but it's a bit of work to fix that. For now,
14035 // trigger a proposal by returning true and then retry the command
14036 // to complete the operation.
14037 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14038 return true;
14039 }
14040
14041 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
14042 {
14043 op->mark_osdmon_event(__func__);
14044
14045 auto m = op->get_req<MPoolOp>();
14046 MonSession *session = op->get_session();
14047 if (!session) {
14048 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14049 return true;
14050 }
14051
14052 switch (m->op) {
14053 case POOL_OP_CREATE_UNMANAGED_SNAP:
14054 case POOL_OP_DELETE_UNMANAGED_SNAP:
14055 {
14056 const std::string* pool_name = nullptr;
14057 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
14058 if (pg_pool != nullptr) {
14059 pool_name = &osdmap.get_pool_name(m->pool);
14060 }
14061
14062 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
14063 session->entity_name, session->caps,
14064 session->get_peer_socket_addr(),
14065 pool_name)) {
14066 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
14067 << "privileges. message: " << *m << std::endl
14068 << "caps: " << session->caps << dendl;
14069 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14070 return true;
14071 }
14072 }
14073 break;
14074 default:
14075 if (!session->is_capable("osd", MON_CAP_W)) {
14076 dout(0) << "got pool op from entity with insufficient privileges. "
14077 << "message: " << *m << std::endl
14078 << "caps: " << session->caps << dendl;
14079 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
14080 return true;
14081 }
14082 break;
14083 }
14084
14085 return false;
14086 }
14087
// Fast-path handling for an incoming MPoolOp: answer requests that do not
// require an osdmap change (idempotent successes or invalid combinations)
// directly.  Returns true if a reply was sent and the op is done; returns
// false to forward the op to prepare_pool_op() for a map update.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  if (enforce_pool_op_caps(op)) {
    // insufficient caps; enforce_pool_op_caps() already replied -EPERM
    return true;
  }

  if (m->fsid != mon.monmap->fsid) {
    // message was addressed to a different cluster; reject it
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting a pool that is already gone is an idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool (managed) snaps are incompatible with unmanaged-snaps mode
    // and with cache tier pools
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // snap already present: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // the two snap modes are mutually exclusive per pool
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // snap already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // snap already removed or purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this name still
    // resolves, rather than when it is absent -- confirm intent against
    // prepare_pool_op_delete() callers.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // auid is no longer supported; prepare path replies -EOPNOTSUPP
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
14175
14176 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
14177 {
14178 if (!osdmap.have_pg_pool(pool)) {
14179 dout(10) << __func__ << " pool " << pool << " snap " << snap
14180 << " - pool dne" << dendl;
14181 return true;
14182 }
14183 if (osdmap.in_removed_snaps_queue(pool, snap)) {
14184 dout(10) << __func__ << " pool " << pool << " snap " << snap
14185 << " - in osdmap removed_snaps_queue" << dendl;
14186 return true;
14187 }
14188 snapid_t begin, end;
14189 int r = lookup_purged_snap(pool, snap, &begin, &end);
14190 if (r == 0) {
14191 dout(10) << __func__ << " pool " << pool << " snap " << snap
14192 << " - purged, [" << begin << "," << end << ")" << dendl;
14193 return true;
14194 }
14195 return false;
14196 }
14197
14198 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14199 {
14200 if (pending_inc.old_pools.count(pool)) {
14201 dout(10) << __func__ << " pool " << pool << " snap " << snap
14202 << " - pool pending deletion" << dendl;
14203 return true;
14204 }
14205 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14206 dout(10) << __func__ << " pool " << pool << " snap " << snap
14207 << " - in pending new_removed_snaps" << dendl;
14208 return true;
14209 }
14210 return false;
14211 }
14212
14213 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14214 {
14215 op->mark_osdmon_event(__func__);
14216 auto m = op->get_req<MPoolOp>();
14217 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14218 if (pool >= 0) {
14219 _pool_op_reply(op, 0, osdmap.get_epoch());
14220 return true;
14221 }
14222
14223 return false;
14224 }
14225
// Apply a pool op that mutates the osdmap (snap create/delete in either
// managed or unmanaged mode).  CREATE/DELETE of whole pools are delegated
// to their own prepare helpers.  Returns true when a proposal was queued
// (reply deferred until commit); false when a reply was already sent.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // snap creation is refused on pools attached to a CephFS filesystem
  if (m->op == POOL_OP_CREATE_SNAP ||
      m->op == POOL_OP_CREATE_UNMANAGED_SNAP) {
    if (const auto& fsmap = mon.mdsmon()->get_fsmap(); fsmap.pool_in_use(m->pool)) {
      dout(20) << "monitor-managed snapshots have been disabled for pools "
	       " attached to an fs - pool:" << m->pool << dendl;
      _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
      return false;
    }
  }

  // first pass: validate against the *committed* pool state and answer
  // the idempotent / invalid cases without queueing a proposal.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // managed snaps are not allowed on cache tier pools
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    }  // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // creating an existing snap or deleting a missing one is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // managed snap ops are invalid on an unmanaged-snaps pool
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked against the projected state, which may differ from the
  // committed state already validated above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // second pass: mutate the projected pool info
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      // snap_exists() returns the snapid (0 if absent)
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus maps also track removed snaps in the legacy way
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is returned to the client in the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      if (m->snapid > pp.get_snap_seq()) {
        // refuse to remove a snapid that was never allocated
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always refuse
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // publish the projected pool into the pending incremental
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // defer the reply (with ret and any payload) until the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14390
14391 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14392 {
14393 op->mark_osdmon_event(__func__);
14394 int err = prepare_new_pool(op);
14395 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14396 return true;
14397 }
14398
14399 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14400 ostream *ss)
14401 {
14402 const string& poolstr = osdmap.get_pool_name(pool_id);
14403
14404 // If the Pool is in use by CephFS, refuse to delete it
14405 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14406 if (pending_fsmap.pool_in_use(pool_id)) {
14407 *ss << "pool '" << poolstr << "' is in use by CephFS";
14408 return -EBUSY;
14409 }
14410
14411 if (pool.tier_of >= 0) {
14412 *ss << "pool '" << poolstr << "' is a tier of '"
14413 << osdmap.get_pool_name(pool.tier_of) << "'";
14414 return -EBUSY;
14415 }
14416 if (!pool.tiers.empty()) {
14417 *ss << "pool '" << poolstr << "' has tiers";
14418 for(auto tier : pool.tiers) {
14419 *ss << " " << osdmap.get_pool_name(tier);
14420 }
14421 return -EBUSY;
14422 }
14423
14424 if (!g_conf()->mon_allow_pool_delete) {
14425 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14426 return -EPERM;
14427 }
14428
14429 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14430 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14431 return -EPERM;
14432 }
14433
14434 *ss << "pool '" << poolstr << "' removed";
14435 return 0;
14436 }
14437
14438 /**
14439 * Check if it is safe to add a tier to a base pool
14440 *
14441 * @return
14442 * True if the operation should proceed, false if we should abort here
14443 * (abort doesn't necessarily mean error, could be idempotency)
14444 */
14445 bool OSDMonitor::_check_become_tier(
14446 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14447 const int64_t base_pool_id, const pg_pool_t *base_pool,
14448 int *err,
14449 ostream *ss) const
14450 {
14451 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14452 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14453
14454 if (tier_pool->is_crimson()) {
14455 *ss << "pool '" << tier_pool_name << "' is a crimson pool, tiering "
14456 << "features are not supported";
14457 *err = -EINVAL;
14458 return false;
14459 }
14460 if (base_pool->is_crimson()) {
14461 *ss << "pool '" << base_pool_name << "' is a crimson pool, tiering "
14462 << "features are not supported";
14463 *err = -EINVAL;
14464 return false;
14465 }
14466
14467 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14468 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14469 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14470 *err = -EBUSY;
14471 return false;
14472 }
14473
14474 if (base_pool->tiers.count(tier_pool_id)) {
14475 ceph_assert(tier_pool->tier_of == base_pool_id);
14476 *err = 0;
14477 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14478 << base_pool_name << "'";
14479 return false;
14480 }
14481
14482 if (base_pool->is_tier()) {
14483 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14484 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14485 << "multiple tiers are not yet supported.";
14486 *err = -EINVAL;
14487 return false;
14488 }
14489
14490 if (tier_pool->has_tiers()) {
14491 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14492 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14493 it != tier_pool->tiers.end(); ++it)
14494 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14495 *ss << " multiple tiers are not yet supported.";
14496 *err = -EINVAL;
14497 return false;
14498 }
14499
14500 if (tier_pool->is_tier()) {
14501 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14502 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14503 *err = -EINVAL;
14504 return false;
14505 }
14506
14507 *err = 0;
14508 return true;
14509 }
14510
14511
14512 /**
14513 * Check if it is safe to remove a tier from this base pool
14514 *
14515 * @return
14516 * True if the operation should proceed, false if we should abort here
14517 * (abort doesn't necessarily mean error, could be idempotency)
14518 */
14519 bool OSDMonitor::_check_remove_tier(
14520 const int64_t base_pool_id, const pg_pool_t *base_pool,
14521 const pg_pool_t *tier_pool,
14522 int *err, ostream *ss) const
14523 {
14524 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14525
14526 // Apply CephFS-specific checks
14527 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14528 if (pending_fsmap.pool_in_use(base_pool_id)) {
14529 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14530 // If the underlying pool is erasure coded and does not allow EC
14531 // overwrites, we can't permit the removal of the replicated tier that
14532 // CephFS relies on to access it
14533 *ss << "pool '" << base_pool_name <<
14534 "' does not allow EC overwrites and is in use by CephFS"
14535 " via its tier";
14536 *err = -EBUSY;
14537 return false;
14538 }
14539
14540 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14541 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14542 "tier is still in use as a writeback cache. Change the cache "
14543 "mode and flush the cache before removing it";
14544 *err = -EBUSY;
14545 return false;
14546 }
14547 }
14548
14549 *err = 0;
14550 return true;
14551 }
14552
14553 int OSDMonitor::_prepare_remove_pool(
14554 int64_t pool, ostream *ss, bool no_fake)
14555 {
14556 dout(10) << __func__ << " " << pool << dendl;
14557 const pg_pool_t *p = osdmap.get_pg_pool(pool);
14558 int r = _check_remove_pool(pool, *p, ss);
14559 if (r < 0)
14560 return r;
14561
14562 auto new_pool = pending_inc.new_pools.find(pool);
14563 if (new_pool != pending_inc.new_pools.end()) {
14564 // if there is a problem with the pending info, wait and retry
14565 // this op.
14566 const auto& p = new_pool->second;
14567 int r = _check_remove_pool(pool, p, ss);
14568 if (r < 0)
14569 return -EAGAIN;
14570 }
14571
14572 if (pending_inc.old_pools.count(pool)) {
14573 dout(10) << __func__ << " " << pool << " already pending removal"
14574 << dendl;
14575 return 0;
14576 }
14577
14578 if (g_conf()->mon_fake_pool_delete && !no_fake) {
14579 string old_name = osdmap.get_pool_name(pool);
14580 string new_name = old_name + "." + stringify(pool) + ".DELETED";
14581 dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
14582 << old_name << " -> " << new_name << dendl;
14583 pending_inc.new_pool_names[pool] = new_name;
14584 return 0;
14585 }
14586
14587 // remove
14588 pending_inc.old_pools.insert(pool);
14589
14590 // remove any pg_temp mappings for this pool
14591 for (auto p = osdmap.pg_temp->begin();
14592 p != osdmap.pg_temp->end();
14593 ++p) {
14594 if (p->first.pool() == pool) {
14595 dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
14596 << p->first << dendl;
14597 pending_inc.new_pg_temp[p->first].clear();
14598 }
14599 }
14600 // remove any primary_temp mappings for this pool
14601 for (auto p = osdmap.primary_temp->begin();
14602 p != osdmap.primary_temp->end();
14603 ++p) {
14604 if (p->first.pool() == pool) {
14605 dout(10) << __func__ << " " << pool
14606 << " removing obsolete primary_temp" << p->first << dendl;
14607 pending_inc.new_primary_temp[p->first] = -1;
14608 }
14609 }
14610 // remove any pg_upmap mappings for this pool
14611 for (auto& p : osdmap.pg_upmap) {
14612 if (p.first.pool() == pool) {
14613 dout(10) << __func__ << " " << pool
14614 << " removing obsolete pg_upmap "
14615 << p.first << dendl;
14616 pending_inc.old_pg_upmap.insert(p.first);
14617 }
14618 }
14619 // remove any pending pg_upmap mappings for this pool
14620 {
14621 auto it = pending_inc.new_pg_upmap.begin();
14622 while (it != pending_inc.new_pg_upmap.end()) {
14623 if (it->first.pool() == pool) {
14624 dout(10) << __func__ << " " << pool
14625 << " removing pending pg_upmap "
14626 << it->first << dendl;
14627 it = pending_inc.new_pg_upmap.erase(it);
14628 } else {
14629 it++;
14630 }
14631 }
14632 }
14633 // remove any pg_upmap_items mappings for this pool
14634 for (auto& p : osdmap.pg_upmap_items) {
14635 if (p.first.pool() == pool) {
14636 dout(10) << __func__ << " " << pool
14637 << " removing obsolete pg_upmap_items " << p.first
14638 << dendl;
14639 pending_inc.old_pg_upmap_items.insert(p.first);
14640 }
14641 }
14642 // remove any pending pg_upmap mappings for this pool
14643 {
14644 auto it = pending_inc.new_pg_upmap_items.begin();
14645 while (it != pending_inc.new_pg_upmap_items.end()) {
14646 if (it->first.pool() == pool) {
14647 dout(10) << __func__ << " " << pool
14648 << " removing pending pg_upmap_items "
14649 << it->first << dendl;
14650 it = pending_inc.new_pg_upmap_items.erase(it);
14651 } else {
14652 it++;
14653 }
14654 }
14655 }
14656
14657 // remove any choose_args for this pool
14658 CrushWrapper newcrush = _get_pending_crush();
14659 if (newcrush.have_choose_args(pool)) {
14660 dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
14661 newcrush.rm_choose_args(pool);
14662 pending_inc.crush.clear();
14663 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
14664 }
14665 return 0;
14666 }
14667
14668 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14669 {
14670 dout(10) << "_prepare_rename_pool " << pool << dendl;
14671 if (pending_inc.old_pools.count(pool)) {
14672 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14673 return -ENOENT;
14674 }
14675 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14676 p != pending_inc.new_pool_names.end();
14677 ++p) {
14678 if (p->second == newname && p->first != pool) {
14679 return -EEXIST;
14680 }
14681 }
14682
14683 pending_inc.new_pool_names[pool] = newname;
14684 return 0;
14685 }
14686
14687 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14688 {
14689 op->mark_osdmon_event(__func__);
14690 auto m = op->get_req<MPoolOp>();
14691 ostringstream ss;
14692 int ret = _prepare_remove_pool(m->pool, &ss, false);
14693 if (ret == -EAGAIN) {
14694 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14695 return true;
14696 }
14697 if (ret < 0)
14698 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14699 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14700 pending_inc.epoch));
14701 return true;
14702 }
14703
14704 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14705 int ret, epoch_t epoch, bufferlist *blp)
14706 {
14707 op->mark_osdmon_event(__func__);
14708 auto m = op->get_req<MPoolOp>();
14709 dout(20) << "_pool_op_reply " << ret << dendl;
14710 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14711 ret, epoch, get_last_committed(), blp);
14712 mon.send_reply(op, reply);
14713 }
14714
14715 void OSDMonitor::convert_pool_priorities(void)
14716 {
14717 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14718 int64_t max_prio = 0;
14719 int64_t min_prio = 0;
14720 for (const auto &i : osdmap.get_pools()) {
14721 const auto &pool = i.second;
14722
14723 if (pool.opts.is_set(key)) {
14724 int64_t prio = 0;
14725 pool.opts.get(key, &prio);
14726 if (prio > max_prio)
14727 max_prio = prio;
14728 if (prio < min_prio)
14729 min_prio = prio;
14730 }
14731 }
14732 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14733 dout(20) << __func__ << " nothing to fix" << dendl;
14734 return;
14735 }
14736 // Current pool priorities exceeds new maximum
14737 for (const auto &i : osdmap.get_pools()) {
14738 const auto pool_id = i.first;
14739 pg_pool_t pool = i.second;
14740
14741 int64_t prio = 0;
14742 pool.opts.get(key, &prio);
14743 int64_t n;
14744
14745 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14746 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14747 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14748 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14749 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14750 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14751 } else {
14752 continue;
14753 }
14754 if (n == 0) {
14755 pool.opts.unset(key);
14756 } else {
14757 pool.opts.set(key, static_cast<int64_t>(n));
14758 }
14759 dout(10) << __func__ << " pool " << pool_id
14760 << " recovery_priority adjusted "
14761 << prio << " to " << n << dendl;
14762 pool.last_change = pending_inc.epoch;
14763 pending_inc.new_pools[pool_id] = pool;
14764 }
14765 }
14766
14767 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14768 int *errcode,
14769 set<pg_pool_t*>* pools,
14770 const string& new_crush_rule)
14771 {
14772 dout(20) << __func__ << dendl;
14773 *okay = false;
14774 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14775 if (new_crush_rule_result < 0) {
14776 ss << "unrecognized crush rule " << new_crush_rule_result;
14777 *errcode = new_crush_rule_result;
14778 return;
14779 }
14780 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14781 for (const auto& pooli : osdmap.pools) {
14782 int64_t poolid = pooli.first;
14783 const pg_pool_t *p = &pooli.second;
14784 if (!p->is_replicated()) {
14785 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14786 *errcode = -EINVAL;
14787 return;
14788 }
14789 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14790 if ((p->get_size() != default_size ||
14791 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14792 (p->get_crush_rule() != new_rule)) {
14793 ss << "we currently require stretch mode pools start out with the"
14794 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14795 *errcode = -EINVAL;
14796 return;
14797 }
14798 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14799 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14800 // the attempt may fail and then we have these pool updates...but they won't do anything
14801 // if there is a failure, so if it's hard to change the interface, no need to bother
14802 pools->insert(pp);
14803 }
14804 *okay = true;
14805 return;
14806 }
14807
14808 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14809 int *errcode, bool commit,
14810 const string& dividing_bucket,
14811 uint32_t bucket_count,
14812 const set<pg_pool_t*>& pools,
14813 const string& new_crush_rule)
14814 {
14815 dout(20) << __func__ << dendl;
14816 *okay = false;
14817 CrushWrapper crush = _get_pending_crush();
14818 int dividing_id = -1;
14819 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14820 !type_id.has_value()) {
14821 ss << dividing_bucket << " is not a valid crush bucket type";
14822 *errcode = -ENOENT;
14823 ceph_assert(!commit);
14824 return;
14825 } else {
14826 dividing_id = *type_id;
14827 }
14828 vector<int> subtrees;
14829 crush.get_subtree_of_type(dividing_id, &subtrees);
14830 if (subtrees.size() != 2) {
14831 ss << "there are " << subtrees.size() << dividing_bucket
14832 << "'s in the cluster but stretch mode currently only works with 2!";
14833 *errcode = -EINVAL;
14834 ceph_assert(!commit || subtrees.size() == 2);
14835 return;
14836 }
14837
14838 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14839 if (new_crush_rule_result < 0) {
14840 ss << "unrecognized crush rule " << new_crush_rule;
14841 *errcode = new_crush_rule_result;
14842 ceph_assert(!commit || (new_crush_rule_result > 0));
14843 return;
14844 }
14845 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14846
14847 int weight1 = crush.get_item_weight(subtrees[0]);
14848 int weight2 = crush.get_item_weight(subtrees[1]);
14849 if (weight1 != weight2) {
14850 // TODO: I'm really not sure this is a good idea?
14851 ss << "the 2 " << dividing_bucket
14852 << "instances in the cluster have differing weights "
14853 << weight1 << " and " << weight2
14854 <<" but stretch mode currently requires they be the same!";
14855 *errcode = -EINVAL;
14856 ceph_assert(!commit || (weight1 == weight2));
14857 return;
14858 }
14859 if (bucket_count != 2) {
14860 ss << "currently we only support 2-site stretch clusters!";
14861 *errcode = -EINVAL;
14862 ceph_assert(!commit || bucket_count == 2);
14863 return;
14864 }
14865 // TODO: check CRUSH rules for pools so that we are appropriately divided
14866 if (commit) {
14867 for (auto pool : pools) {
14868 pool->crush_rule = new_rule;
14869 pool->peering_crush_bucket_count = bucket_count;
14870 pool->peering_crush_bucket_target = bucket_count;
14871 pool->peering_crush_bucket_barrier = dividing_id;
14872 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14873 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14874 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14875 }
14876 pending_inc.change_stretch_mode = true;
14877 pending_inc.stretch_mode_enabled = true;
14878 pending_inc.new_stretch_bucket_count = bucket_count;
14879 pending_inc.new_degraded_stretch_mode = 0;
14880 pending_inc.new_stretch_mode_bucket = dividing_id;
14881 }
14882 *okay = true;
14883 return;
14884 }
14885
14886 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14887 set<int> *really_down_buckets,
14888 set<string> *really_down_mons)
14889 {
14890 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14891 ceph_assert(is_readable());
14892 if (dead_buckets.empty()) return false;
14893 set<int> down_cache;
14894 bool really_down = false;
14895 for (auto dbi : dead_buckets) {
14896 const string& bucket_name = dbi.first;
14897 ceph_assert(osdmap.crush->name_exists(bucket_name));
14898 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14899 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14900 << " to see if OSDs are also down" << dendl;
14901 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14902 if (subtree_down) {
14903 dout(20) << "subtree is down!" << dendl;
14904 really_down = true;
14905 really_down_buckets->insert(bucket_id);
14906 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14907 }
14908 }
14909 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14910 << " and mons " << *really_down_mons << " are really down" << dendl;
14911 return really_down;
14912 }
14913
14914 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14915 const set<string>& live_zones)
14916 {
14917 dout(20) << __func__ << dendl;
14918 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14919 // update the general OSDMap changes
14920 pending_inc.change_stretch_mode = true;
14921 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14922 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14923 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14924 ceph_assert(new_site_count == 1); // stretch count 2!
14925 pending_inc.new_degraded_stretch_mode = new_site_count;
14926 pending_inc.new_recovering_stretch_mode = 0;
14927 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14928
14929 // and then apply them to all the pg_pool_ts
14930 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14931 const string& remaining_site_name = *(live_zones.begin());
14932 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14933 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14934 for (auto pgi : osdmap.pools) {
14935 if (pgi.second.peering_crush_bucket_count) {
14936 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14937 newp.peering_crush_bucket_count = new_site_count;
14938 newp.peering_crush_mandatory_member = remaining_site;
14939 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14940 newp.set_last_force_op_resend(pending_inc.epoch);
14941 }
14942 }
14943 propose_pending();
14944 }
14945
14946 void OSDMonitor::trigger_recovery_stretch_mode()
14947 {
14948 dout(20) << __func__ << dendl;
14949 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14950 pending_inc.change_stretch_mode = true;
14951 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14952 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14953 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14954 pending_inc.new_recovering_stretch_mode = 1;
14955 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14956
14957 for (auto pgi : osdmap.pools) {
14958 if (pgi.second.peering_crush_bucket_count) {
14959 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14960 newp.set_last_force_op_resend(pending_inc.epoch);
14961 }
14962 }
14963 propose_pending();
14964 }
14965
// Entering degraded stretch mode: clear the recovery-trigger timestamp,
// since the countdown toward ending recovery no longer applies.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14970
// Record when recovery stretch mode began; try_end_recovery_stretch_mode()
// compares against this timestamp to enforce a minimum wait. Only set it
// if unset so repeated calls don't keep pushing the deadline out.
void OSDMonitor::set_recovery_stretch_mode()
{
  if (stretch_recovery_triggered.is_zero()) {
    stretch_recovery_triggered = ceph_clock_now();
  }
}
14977
// Back to healthy stretch mode: drop the recovery-trigger timestamp.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14982
14983 void OSDMonitor::notify_new_pg_digest()
14984 {
14985 dout(20) << __func__ << dendl;
14986 if (!stretch_recovery_triggered.is_zero()) {
14987 try_end_recovery_stretch_mode(false);
14988 }
14989 }
14990
14991 struct CMonExitRecovery : public Context {
14992 OSDMonitor *m;
14993 bool force;
14994 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14995 void finish(int r) {
14996 m->try_end_recovery_stretch_mode(force);
14997 }
14998 };
14999
// Attempt to leave recovering stretch mode and return to healthy.
// Only the leader may do this, and only while the map is actually in the
// degraded+recovering stretch states; when our own state or the mgr stat
// state isn't readable yet, we queue a CMonExitRecovery to retry later.
// With force=true, skip the minimum-wait and clean-PG requirements.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // retry once our paxos state is readable again
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // NOTE(review): the minimum-wait check subtracts the configured wait
  // (seconds, as double) from the current time and compares against the
  // trigger timestamp — presumably equivalent to "now > trigger + wait";
  // confirm utime_t's operator-(double) semantics.
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // need PG stats to decide; retry when the mgr stat state is readable
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced PGs are tolerated; degraded/inactive/unknown are not
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
15029
15030 void OSDMonitor::trigger_healthy_stretch_mode()
15031 {
15032 ceph_assert(is_writeable());
15033 stretch_recovery_triggered.set_from_double(0);
15034 pending_inc.change_stretch_mode = true;
15035 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
15036 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
15037 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
15038 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
15039 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
15040 for (auto pgi : osdmap.pools) {
15041 if (pgi.second.peering_crush_bucket_count) {
15042 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
15043 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
15044 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
15045 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
15046 newp.set_last_force_op_resend(pending_inc.epoch);
15047 }
15048 }
15049 propose_pending();
15050 }