]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 using std::dec;
95 using std::hex;
96 using std::list;
97 using std::map;
98 using std::make_pair;
99 using std::ostringstream;
100 using std::pair;
101 using std::set;
102 using std::string;
103 using std::stringstream;
104 using std::to_string;
105 using std::vector;
106
107 using ceph::bufferlist;
108 using ceph::decode;
109 using ceph::encode;
110 using ceph::ErasureCodeInterfaceRef;
111 using ceph::ErasureCodePluginRegistry;
112 using ceph::ErasureCodeProfile;
113 using ceph::Formatter;
114 using ceph::JSONFormatter;
115 using ceph::make_message;
116
117 #define dout_subsys ceph_subsys_mon
118 static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
119 static const string OSD_METADATA_PREFIX("osd_metadata");
120 static const string OSD_SNAP_PREFIX("osd_snap");
121
122 /*
123
124 OSD snapshot metadata
125 ---------------------
126
127 -- starting with mimic, removed in octopus --
128
129 "removed_epoch_%llu_%08lx" % (pool, epoch)
130 -> interval_set<snapid_t>
131
132 "removed_snap_%llu_%016llx" % (pool, last_snap)
133 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134
135
136 -- starting with mimic --
137
138 "purged_snap_%llu_%016llx" % (pool, last_snap)
139 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
140
141 - note that the {removed,purged}_snap put the last snap in they key so
142 that we can use forward iteration only to search for an epoch in an
143 interval. e.g., to test if epoch N is removed/purged, we'll find a key
144 >= N that either does or doesn't contain the given snap.
145
146
147 -- starting with octopus --
148
149 "purged_epoch_%08lx" % epoch
150 -> map<int64_t,interval_set<snapid_t>>
151
152 */
153 using namespace TOPNSPC::common;
154 namespace {
155
// Glue between an OSDMonitor-owned LRU map cache and the PriorityCache
// manager (pcm).  The pcm calls these hooks to learn how many bytes the
// cache wants at each priority level and to tell it how much it was
// granted.  Concrete subclasses (IncCache/FullCache below) report the
// bytes used by the inc/full OSDMap caches via _get_used_bytes().
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;   // owning monitor; source of the underlying caches
  // bytes currently assigned to this cache, indexed by priority level
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;   // size granted at the last commit
  double cache_ratio = 0;        // fraction of the total cache we may use

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes actually in use by the underlying cache (subclass-specific).
  virtual uint64_t _get_used_bytes() const = 0;

  // Request additional bytes at priority 'pri': the shortfall between
  // current usage and what was already assigned.  Only PRI1 is used;
  // other priorities report -EOPNOTSUPP.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        int64_t request = _get_used_bytes();
        // only ask for the shortfall, never a negative amount
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assigned bytes across all priority levels.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the total assignment to the manager's chunk size and remember
  // it as the committed size for this cache.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
        get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Bin-based accounting is not used by these caches; the bin hooks are
  // deliberate no-ops.
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  // Name shown in pcm logging (subclass-specific).
  virtual string get_cache_name() const = 0;
};
229
230 struct IncCache : public OSDMemCache {
231 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
232
233 virtual uint64_t _get_used_bytes() const {
234 return osdmon->inc_osd_cache.get_bytes();
235 }
236
237 virtual string get_cache_name() const {
238 return "OSDMap Inc Cache";
239 }
240
241 uint64_t _get_num_osdmaps() const {
242 return osdmon->inc_osd_cache.get_size();
243 }
244 };
245
246 struct FullCache : public OSDMemCache {
247 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
248
249 virtual uint64_t _get_used_bytes() const {
250 return osdmon->full_osd_cache.get_bytes();
251 }
252
253 virtual string get_cache_name() const {
254 return "OSDMap Full Cache";
255 }
256
257 uint64_t _get_num_osdmaps() const {
258 return osdmon->full_osd_cache.get_size();
259 }
260 };
261
// PriorityCache adapters shared with the pcm; created in the OSDMonitor
// constructor and registered in register_cache_with_pcm().
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Limits applied when validating pool application metadata.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
268
269 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
270 // Note: this doesn't include support for the application tag match
271 if ((grant.spec.allow & OSD_CAP_W) != 0) {
272 auto& match = grant.match;
273 if (match.is_match_all()) {
274 return true;
275 } else if (pool_name != nullptr &&
276 !match.pool_namespace.pool_name.empty() &&
277 match.pool_namespace.pool_name == *pool_name) {
278 return true;
279 }
280 }
281 return false;
282 }
283
// Decide whether 'entity_name' may issue unmanaged-snapshot pool ops on
// 'pool_name' (nullptr means "no specific pool", which requires an
// unrestricted cap).  Permission is granted either by an explicit mon
// cap for "osd pool op unmanaged-snap", or by OSD write caps read from
// the auth database: write access implies snapshot management.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  // first: does the session's mon cap allow it directly?
  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
          CommandArgs{} /* pool DNE, require unrestricted cap */ :
          CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // otherwise fall back to the entity's OSD caps from the auth db
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into a list of grants; any writable one counts
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
351
352 } // anonymous namespace
353
// Record that PG 'ps' of a pool with 'pg_num' PGs was clean through
// 'last_epoch_clean'.  Maintains three pieces of state:
//  - epoch_by_pg: per-PG last-epoch-clean values (0 == never reported)
//  - floor: the minimum last-epoch-clean across the pool's PGs
//  - next_missing: index of the first PG that has not reported yet
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
				 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down, so rescan
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
					std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past the contiguous run of reported PGs
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
387
388 void LastEpochClean::remove_pool(uint64_t pool)
389 {
390 report_by_pool.erase(pool);
391 }
392
393 void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
394 epoch_t last_epoch_clean)
395 {
396 auto& lec = report_by_pool[pg.pool()];
397 return lec.report(pg_num, pg.ps(), last_epoch_clean);
398 }
399
400 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
401 {
402 auto floor = latest.get_epoch();
403 for (auto& pool : latest.get_pools()) {
404 auto reported = report_by_pool.find(pool.first);
405 if (reported == report_by_pool.end()) {
406 return 0;
407 }
408 if (reported->second.next_missing < pool.second.get_pg_num()) {
409 return 0;
410 }
411 if (reported->second.floor < floor) {
412 floor = reported->second.floor;
413 }
414 }
415 return floor;
416 }
417
418 void LastEpochClean::dump(Formatter *f) const
419 {
420 f->open_array_section("per_pool");
421
422 for (auto& [pool, lec] : report_by_pool) {
423 f->open_object_section("pool");
424 f->dump_unsigned("poolid", pool);
425 f->dump_unsigned("floor", lec.floor);
426 f->close_section();
427 }
428
429 f->close_section();
430 }
431
// Completion for the background PG mapping job: when the job started
// against osdmap 'epoch' finishes successfully, refresh the
// creating-PGs state and notify pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // time the mapping job was started, for the log timing
  epoch_t epoch;   // osdmap epoch the mapping job ran against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
	       << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
449
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix for this file:
//   "mon.<name>@<rank>(<state>).osd e<epoch> "
static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon.name << "@" << mon.rank
		<< "(" << mon.get_state_name()
		<< ").osd e" << osdmap.get_epoch() << " ";
}
457
// Construct the OSD monitor service: size the inc/full OSDMap LRU
// caches from mon_osd_cache_size, create the PriorityCache adapters
// used for optional cache autotuning, and register this object as a
// config observer so cache settings can change at runtime.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to a fixed-size cache without pcm management
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
481
482 const char **OSDMonitor::get_tracked_conf_keys() const
483 {
484 static const char* KEYS[] = {
485 "mon_memory_target",
486 "mon_memory_autotune",
487 "rocksdb_cache_size",
488 NULL
489 };
490 return KEYS;
491 }
492
// Config observer hook: react to runtime changes of the tracked keys
// (see get_tracked_conf_keys) by toggling cache autotuning and/or
// recomputing the mon/rocksdb cache budgets.
void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
                                    const std::set<std::string> &changed)
{
  dout(10) << __func__ << " " << changed << dendl;

  if (changed.count("mon_memory_autotune")) {
    _set_cache_autotuning();
  }
  if (changed.count("mon_memory_target") ||
      changed.count("rocksdb_cache_size")) {
    int r = _update_mon_cache_settings();
    if (r < 0) {
      derr << __func__ << " mon_memory_target:"
           << g_conf()->mon_memory_target
           << " rocksdb_cache_size:"
           << g_conf()->rocksdb_cache_size
           << ". Unable to update cache size."
           << dendl;
    }
  }
}
514
// Enable or disable pcm-driven cache autotuning so it matches the
// current value of the mon_memory_autotune option.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
536
// Apply runtime changes to mon_memory_target / rocksdb_cache_size:
// recompute the kv/inc/full cache ratios and, when autotuning is
// active, push the new min/max/target sizes into the pcm and
// rebalance.  Returns 0 on success; on -EINVAL the previous target and
// rocksdb size are restored.
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // keep the old values so we can roll back if the new ratios are bad
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // same sizing logic as register_cache_with_pcm(): reserve base memory
  // and expected fragmentation before computing the cache maximum
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
596
// Read the cache-related config options at startup.  When autotuning is
// enabled, validate mon_memory_target / mon_osd_cache_size_min and
// prime the inc/full LRU caches with the minimum byte budget (the pcm
// resizes them later).  Returns -EINVAL on invalid sizes.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
621
622 bool OSDMonitor::_have_pending_crush()
623 {
624 return pending_inc.crush.length() > 0;
625 }
626
// Reference to the CRUSH map of the committed osdmap (no pending
// changes applied).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
631
632 CrushWrapper OSDMonitor::_get_pending_crush()
633 {
634 bufferlist bl;
635 if (pending_inc.crush.length())
636 bl = pending_inc.crush;
637 else
638 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
639
640 auto p = bl.cbegin();
641 CrushWrapper crush;
642 crush.decode(p);
643 return crush;
644 }
645
// Build the very first osdmap (epoch 1) for a new cluster: start from a
// mkfs-provided map if one was stashed in the store, otherwise a simple
// default map; apply default flags, full ratios and require-release
// settings; then encode the result into the pending incremental as a
// full map.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios configured as > 1.0 are treated as percentages
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
      derr << __func__ << " mon_debug_no_require_quincy and pacific=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    } else {
      derr << __func__ << " mon_debug_no_require_quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::quincy;
  }

  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
		features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
704
705 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
706 {
707 s.insert(service_name);
708 s.insert(OSD_PG_CREATING_PREFIX);
709 s.insert(OSD_METADATA_PREFIX);
710 s.insert(OSD_SNAP_PREFIX);
711 }
712
// Catch the in-memory osdmap up with the paxos-committed state:
//  1. reload the osdmap manifest and health data,
//  2. locate (repairing the pointer if needed) the latest stashed full
//     map and load it if it is newer than what we hold,
//  3. apply every committed incremental, persisting full maps as we go
//     and verifying CRCs against a primary-provided full map,
//  4. refresh state derived from the map: down->out tracking,
//     subscriptions, msgr features, PG mappings and stretch mode.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
	   << ", my e " << osdmap.epoch << dendl;

  // remembered so the stretch-mode handling below can see whether this
  // update brought OSDs up
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
	      << mapping_job.get() << " did not complete, "
	      << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'.  This is only done when we are building the
   * full maps from the incremental versions.  But don't panic!  We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
	     << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the first stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
	dout(10) << __func__ << " found latest full map v " << v << dendl;
	latest_full = v;
	break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
	     << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // the stashed full map is newer than what we hold; start from it
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
	    << creating_pgs.last_scan_epoch
	    << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
	    << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
	dout(10) << __func__
		 << " Error while registering osdmon caches with pcm."
		 << " Proceeding without cache auto tuning."
		 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
	    << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs.  Encode the full
    // map with the same features as the incremental.  If we don't
    // know, use the quorum features.  If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
	// This will happen if the mons were running mixed versions in
	// the past or some other circumstance made the full encoded
	// maps divergent.  Reloading here will bring us back into
	// sync with the primary for this and all future maps.  OSDs
	// will also be brought back into sync when they discover the
	// crc mismatch and request a full map from a mon.
	derr << __func__ << " full map CRC mismatch, resetting to canonical"
	     << dendl;

	// NOTE: the dout macro opens a scope, so each JSONFormatter
	// below lives in its own block and the names do not collide
	dout(20) << __func__ << " my (bad) full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	full_bl.hexdump(*_dout);
	*_dout << dendl;

	osdmap = OSDMap();
	osdmap.decode(orig_full_bl);

	dout(20) << __func__ << " canonical full osdmap:\n";
	JSONFormatter jf(true);
	jf.dump_object("osdmap", osdmap);
	jf.flush(*_dout);
	*_dout << "\nhexdump:\n";
	orig_full_bl.hexdump(*_dout);
	*_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      t->erase("mkfs", "osdmap");
    }

    // flush the accumulated transaction before it grows too large
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
	// could be marked up *or* down, but we're too lazy to check which
	last_osd_report.erase(osd);
      }
    }
    for (const auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
	// manually marked out, so drop it
	osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile the down -> out pending map with the new osdmap
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
	dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
	down_pending_out[o] = ceph_clock_now();
	marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
	dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
	down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
	mon.set_degraded_stretch_mode();
	if (prev_num_up_osd < osdmap.num_up_osd &&
	    (osdmap.num_up_osd / (double)osdmap.num_osd) >
	    cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
	  // TODO: This works for 2-site clusters when the OSD maps are appropriately
	  // trimmed and everything is "normal" but not if you have a lot of out OSDs
	  // you're ignoring or in some really degenerate failure cases
	  dout(10) << "Enabling recovery stretch mode in this map" << dendl;
	  mon.go_recovery_stretch_mode();
	}
      } else {
	mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
	(!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
984
// Create the PriorityCache manager and register the rocksdb, inc and
// full caches with it.  Returns 0 on success, or -EINVAL when the
// configured sizes/ratios are unusable or the store exposes no binned
// rocksdb cache.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
1034
// Split the memory target between the kv (rocksdb) cache and the two
// osdmap caches: kv gets rocksdb_cache_size / mon_memory_target, and
// the inc/full caches share the remainder equally.  Fails (restoring
// the previous kv ratio) if the kv cache would consume the whole
// target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
1059
1060 void OSDMonitor::start_mapping()
1061 {
1062 // initiate mapping job
1063 if (mapping_job) {
1064 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1065 << dendl;
1066 mapping_job->abort();
1067 }
1068 if (!osdmap.get_pools().empty()) {
1069 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1070 mapping_job = mapping.start_update(osdmap, mapper,
1071 g_conf()->mon_osd_mapping_pgs_per_chunk);
1072 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1073 << " at " << fin->start << dendl;
1074 mapping_job->set_finish_event(fin);
1075 } else {
1076 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1077 mapping_job = nullptr;
1078 }
1079 }
1080
1081 void OSDMonitor::update_msgr_features()
1082 {
1083 const int types[] = {
1084 entity_name_t::TYPE_OSD,
1085 entity_name_t::TYPE_CLIENT,
1086 entity_name_t::TYPE_MDS,
1087 entity_name_t::TYPE_MON
1088 };
1089 for (int type : types) {
1090 uint64_t mask;
1091 uint64_t features = osdmap.get_features(type, &mask);
1092 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1093 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1094 ceph::net::Policy p = mon.messenger->get_policy(type);
1095 p.features_required = (p.features_required & ~mask) | features;
1096 mon.messenger->set_policy(type, p);
1097 }
1098 }
1099 }
1100
1101 void OSDMonitor::on_active()
1102 {
1103 update_logger();
1104
1105 if (mon.is_leader()) {
1106 mon.clog->debug() << "osdmap " << osdmap;
1107 if (!priority_convert) {
1108 // Only do this once at start-up
1109 convert_pool_priorities();
1110 priority_convert = true;
1111 }
1112 } else {
1113 list<MonOpRequestRef> ls;
1114 take_all_failures(ls);
1115 while (!ls.empty()) {
1116 MonOpRequestRef op = ls.front();
1117 op->mark_osdmon_event(__func__);
1118 dispatch(op);
1119 ls.pop_front();
1120 }
1121 }
1122 start_mapping();
1123 }
1124
void OSDMonitor::on_restart()
{
  // Forget per-osd report timestamps; they repopulate as osds check in
  // again after the election/bootstrap completes.
  last_osd_report.clear();
}
1129
// Tear down osdmap-service state on monitor shutdown: abort any
// background mapping computation and drop queued failure reports.
void OSDMonitor::on_shutdown()
{
  dout(10) << __func__ << dendl;
  if (mapping_job) {
    dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
             << dendl;
    mapping_job->abort();
  }

  // discard failure info, waiters
  list<MonOpRequestRef> ls;
  take_all_failures(ls);
  ls.clear();
}
1144
// Publish current osdmap summary counters (osd counts and epoch) to the
// cluster perf logger.
void OSDMonitor::update_logger()
{
  dout(10) << "update_logger" << dendl;

  mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
  mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
  mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
  mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
}
1154
1155 void OSDMonitor::create_pending()
1156 {
1157 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1158 pending_inc.fsid = mon.monmap->fsid;
1159 pending_metadata.clear();
1160 pending_metadata_rm.clear();
1161 pending_pseudo_purged_snaps.clear();
1162
1163 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1164
1165 // safety checks (this shouldn't really happen)
1166 {
1167 if (osdmap.backfillfull_ratio <= 0) {
1168 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1169 if (pending_inc.new_backfillfull_ratio > 1.0)
1170 pending_inc.new_backfillfull_ratio /= 100;
1171 dout(1) << __func__ << " setting backfillfull_ratio = "
1172 << pending_inc.new_backfillfull_ratio << dendl;
1173 }
1174 if (osdmap.full_ratio <= 0) {
1175 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1176 if (pending_inc.new_full_ratio > 1.0)
1177 pending_inc.new_full_ratio /= 100;
1178 dout(1) << __func__ << " setting full_ratio = "
1179 << pending_inc.new_full_ratio << dendl;
1180 }
1181 if (osdmap.nearfull_ratio <= 0) {
1182 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1183 if (pending_inc.new_nearfull_ratio > 1.0)
1184 pending_inc.new_nearfull_ratio /= 100;
1185 dout(1) << __func__ << " setting nearfull_ratio = "
1186 << pending_inc.new_nearfull_ratio << dendl;
1187 }
1188 }
1189 }
1190
/**
 * Compute the next creating-pgs state to accompany the pending
 * incremental.
 *
 * Starting from a snapshot of @c creating_pgs (taken under
 * creating_pgs_lock), this: rescans for pools added/removed by @p inc,
 * drops pgs already reported created, filters out pgs that do not
 * exist in @p nextmap, admits more pgs from the per-pool queue up to
 * mon_osd_max_creating_pgs, and — on octopus+ quorums — rolls each
 * creating pg's history/past_intervals forward to @p inc's epoch.
 *
 * @param inc      the pending incremental being finalized
 * @param nextmap  the osdmap with @p inc already applied
 * @return the updated creating_pgs_t (caller encodes/stores it)
 */
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
			       const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    // work on a private copy; creating_pgs is shared with other threads
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the existing pools and the pools this inc introduces
    queued += scan_for_creating_pgs(osdmap.get_pools(),
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
				    inc.old_pools,
				    inc.modified,
				    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
	       << " pg removed because containing pool deleted: "
	       << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
	     << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
	dout(10) << __func__ << " removing pg " << i->first
		 << " which should not exist" << dendl;
	i = pending_creatings.pgs.erase(i);
      } else {
	++i;
      }
    }
  }

  // process queue: admit queued pg ranges until we hit the cap on
  // simultaneously-creating pgs or the queue is drained
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
	 !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
	     << " created " << p->second.created
	     << " modified " << p->second.modified
	     << " [" << p->second.start << "-" << p->second.end << ")"
	     << dendl;
    // take as many pgs from this pool's [start, end) range as the cap allows
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
				  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
	pgid,
	creating_pgs_t::pg_create_info(inc.epoch,
				       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
	       << " now [" << p->second.start << "-" << p->second.end << ")"
	       << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
	   << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
	const pg_pool_t *pi;
	bool operator()(const set<pg_shard_t> &have) const {
	  return have.size() >= pi->min_size;
	}
	explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
	pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
	// new pg entry, set it up
	i.second.up = up;
	i.second.acting = acting;
	i.second.up_primary = up_primary;
	i.second.acting_primary = acting_primary;
	i.second.history = pg_history_t(i.second.create_epoch,
					i.second.create_stamp);
	dout(10) << __func__ << " pg " << pgid << " just added, "
		 << " up " << i.second.up
		 << " p " << i.second.up_primary
		 << " acting " << i.second.acting
		 << " p " << i.second.acting_primary
		 << " history " << i.second.history
		 << " past_intervals " << i.second.past_intervals
		 << dendl;
      } else {
	std::stringstream debug;
	// existing entry: detect whether the osdmap transition opens a
	// new peering interval; if so, record it in past_intervals and
	// bump the relevant same_*_since markers
	if (PastIntervals::check_new_interval(
	      i.second.acting_primary, acting_primary,
	      i.second.acting, acting,
	      i.second.up_primary, up_primary,
	      i.second.up, up,
	      i.second.history.same_interval_since,
	      i.second.history.last_epoch_clean,
	      &nextmap,
	      &osdmap,
	      pgid,
	      min_size_predicate,
	      &i.second.past_intervals,
	      &debug)) {
	  epoch_t e = inc.epoch;
	  i.second.history.same_interval_since = e;
	  if (i.second.up != up) {
	    i.second.history.same_up_since = e;
	  }
	  if (i.second.acting_primary != acting_primary) {
	    i.second.history.same_primary_since = e;
	  }
	  if (pgid.is_split(
		osdmap.get_pg_num(pgid.pool()),
		nextmap.get_pg_num(pgid.pool()),
		nullptr)) {
	    i.second.history.last_epoch_split = e;
	  }
	  dout(10) << __func__ << " pg " << pgid << " new interval,"
		   << " up " << i.second.up << " -> " << up
		   << " p " << i.second.up_primary << " -> " << up_primary
		   << " acting " << i.second.acting << " -> " << acting
		   << " p " << i.second.acting_primary << " -> "
		   << acting_primary
		   << " history " << i.second.history
		   << " past_intervals " << i.second.past_intervals
		   << dendl;
	  dout(20) << " debug: " << debug.str() << dendl;
	  i.second.up = up;
	  i.second.acting = acting;
	  i.second.up_primary = up_primary;
	  i.second.acting_primary = acting_primary;
	}
      }
    }
  }
  dout(10) << __func__
	   << " " << (pending_creatings.pgs.size() - total)
	   << "/" << pending_creatings.pgs.size()
	   << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1377
/**
 * Pre-populate pg_temp entries in the pending incremental so that reads
 * keep flowing from the previous acting sets while the new map settles.
 *
 * Strategy: if the change is sweeping (new crush map, newly-up osds, or
 * any weight increase), prime all pgs via a parallel PrimeTempJob;
 * otherwise only prime pgs touching the "interesting" osds (marked
 * down, or weight-reduced).  A size estimate may still promote the
 * targeted case to "all".  Work is bounded in both modes by
 * mon_osd_prime_pg_temp_max_time, so priming is best-effort.
 */
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // osds being marked down in this inc (new_state XORs CEPH_OSD_UP on a
  // currently-up osd)
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
	osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // a weight increase can pull pgs toward this osd from anywhere,
      // so fall back to priming everything
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
	       << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // rough cost estimate: pgs-per-osd (sampled from the first osd)
    // times the number of affected osds; if that rivals a full pass,
    // just do the full pass
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
	g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds >= "
	       << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
	       << mapping.get_num_pgs() << " pgs, all"
	       << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
	       << osds.size() << " osds" << dendl;
    }
  }

  // materialize the post-incremental map to compute future mappings
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    // parallel sweep over every pg, bounded by the time budget
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
	       << g_conf()->mon_osd_prime_pg_temp_max_time
	       << ", stopping" << dendl;
      job.abort();
    }
  } else {
    // targeted pass: only pgs whose acting set includes an interesting
    // osd, deduplicated, with the clock checked every `chunk` pgs
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
	if (!did_pgs.insert(pgid).second) {
	  continue;
	}
	prime_pg_temp(next, pgid);
	if (--n <= 0) {
	  n = chunk;
	  if (ceph_clock_now() > stop) {
	    dout(10) << __func__ << " consumed more than "
		     << g_conf()->mon_osd_prime_pg_temp_max_time
		     << " seconds, stopping"
		     << dendl;
	    return;
	  }
	}
      }
    }
  }
}
1480
/**
 * Consider adding a pg_temp entry for @p pgid to the pending
 * incremental, pinning the pg to its current acting set while the new
 * map (@p next) takes effect.
 *
 * Skips pgs that are still being created, don't exist in the current
 * map, whose acting set is unchanged, or where priming could not make
 * things better (previously-empty or below-min_size acting sets).
 * An empty acting vector in the entry clears any existing pg_temp.
 */
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping per the background mapping job
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the pending (next) map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
			    &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    // an empty pg_temp entry means "remove the pg_temp mapping"
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
	     << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
	   << " -> " << next_up << "/" << next_acting
	   << ", priming " << acting
	   << dendl;
  {
    // may run concurrently from PrimeTempJob workers; serialize access
    // to pending_inc.new_pg_temp
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1528
1529 /**
1530 * @note receiving a transaction in this function gives a fair amount of
1531 * freedom to the service implementation if it does need it. It shouldn't.
1532 */
1533 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1534 {
1535 dout(10) << "encode_pending e " << pending_inc.epoch
1536 << dendl;
1537
1538 if (do_prune(t)) {
1539 dout(1) << __func__ << " osdmap full prune encoded e"
1540 << pending_inc.epoch << dendl;
1541 }
1542
1543 // finalize up pending_inc
1544 pending_inc.modified = ceph_clock_now();
1545
1546 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1547 ceph_assert(r == 0);
1548
1549 if (mapping_job) {
1550 if (!mapping_job->is_done()) {
1551 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1552 << mapping_job.get() << " did not complete, "
1553 << mapping_job->shards << " left" << dendl;
1554 mapping_job->abort();
1555 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1556 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1557 << mapping_job.get() << " is prior epoch "
1558 << mapping.get_epoch() << dendl;
1559 } else {
1560 if (g_conf()->mon_osd_prime_pg_temp) {
1561 maybe_prime_pg_temp();
1562 }
1563 }
1564 } else if (g_conf()->mon_osd_prime_pg_temp) {
1565 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1566 << dendl;
1567 }
1568 mapping_job.reset();
1569
1570 // ensure we don't have blank new_state updates. these are interrpeted as
1571 // CEPH_OSD_UP (and almost certainly not what we want!).
1572 auto p = pending_inc.new_state.begin();
1573 while (p != pending_inc.new_state.end()) {
1574 if (p->second == 0) {
1575 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1576 p = pending_inc.new_state.erase(p);
1577 } else {
1578 if (p->second & CEPH_OSD_UP) {
1579 pending_inc.new_last_up_change = pending_inc.modified;
1580 }
1581 ++p;
1582 }
1583 }
1584 if (!pending_inc.new_up_client.empty()) {
1585 pending_inc.new_last_up_change = pending_inc.modified;
1586 }
1587 for (auto& i : pending_inc.new_weight) {
1588 if (i.first >= osdmap.max_osd) {
1589 if (i.second) {
1590 // new osd is already marked in
1591 pending_inc.new_last_in_change = pending_inc.modified;
1592 break;
1593 }
1594 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1595 // existing osd marked in or out
1596 pending_inc.new_last_in_change = pending_inc.modified;
1597 break;
1598 }
1599 }
1600
1601 {
1602 OSDMap tmp;
1603 tmp.deepish_copy_from(osdmap);
1604 tmp.apply_incremental(pending_inc);
1605
1606 // clean pg_temp mappings
1607 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1608
1609 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1610 {
1611 // check every upmapped pg for now
1612 // until we could reliably identify certain cases to ignore,
1613 // which is obviously the hard part TBD..
1614 vector<pg_t> pgs_to_check;
1615 tmp.get_upmap_pgs(&pgs_to_check);
1616 if (pgs_to_check.size() <
1617 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1618 // not enough pgs, do it inline
1619 tmp.clean_pg_upmaps(cct, &pending_inc);
1620 } else {
1621 CleanUpmapJob job(cct, tmp, pending_inc);
1622 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1623 job.wait();
1624 }
1625 }
1626
1627 // update creating pgs first so that we can remove the created pgid and
1628 // process the pool flag removal below in the same osdmap epoch.
1629 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1630 bufferlist creatings_bl;
1631 uint64_t features = CEPH_FEATURES_ALL;
1632 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1633 dout(20) << __func__ << " encoding pending pgs without octopus features"
1634 << dendl;
1635 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1636 }
1637 encode(pending_creatings, creatings_bl, features);
1638 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1639
1640 // remove any old (or incompat) POOL_CREATING flags
1641 for (auto& i : tmp.get_pools()) {
1642 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1643 // pre-nautilus OSDMaps shouldn't get this flag.
1644 if (pending_inc.new_pools.count(i.first)) {
1645 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1646 }
1647 }
1648 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1649 !pending_creatings.still_creating_pool(i.first)) {
1650 dout(10) << __func__ << " done creating pool " << i.first
1651 << ", clearing CREATING flag" << dendl;
1652 if (pending_inc.new_pools.count(i.first) == 0) {
1653 pending_inc.new_pools[i.first] = i.second;
1654 }
1655 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1656 }
1657 }
1658
1659 // collect which pools are currently affected by
1660 // the near/backfill/full osd(s),
1661 // and set per-pool near/backfill/full flag instead
1662 set<int64_t> full_pool_ids;
1663 set<int64_t> backfillfull_pool_ids;
1664 set<int64_t> nearfull_pool_ids;
1665 tmp.get_full_pools(cct,
1666 &full_pool_ids,
1667 &backfillfull_pool_ids,
1668 &nearfull_pool_ids);
1669 if (full_pool_ids.empty() ||
1670 backfillfull_pool_ids.empty() ||
1671 nearfull_pool_ids.empty()) {
1672 // normal case - no nearfull, backfillfull or full osds
1673 // try cancel any improper nearfull/backfillfull/full pool
1674 // flags first
1675 for (auto &pool: tmp.get_pools()) {
1676 auto p = pool.first;
1677 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1678 nearfull_pool_ids.empty()) {
1679 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1680 << "'s nearfull flag" << dendl;
1681 if (pending_inc.new_pools.count(p) == 0) {
1682 // load original pool info first!
1683 pending_inc.new_pools[p] = pool.second;
1684 }
1685 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1686 }
1687 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1688 backfillfull_pool_ids.empty()) {
1689 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1690 << "'s backfillfull flag" << dendl;
1691 if (pending_inc.new_pools.count(p) == 0) {
1692 pending_inc.new_pools[p] = pool.second;
1693 }
1694 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1695 }
1696 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1697 full_pool_ids.empty()) {
1698 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1699 // set by EQUOTA, skipping
1700 continue;
1701 }
1702 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1703 << "'s full flag" << dendl;
1704 if (pending_inc.new_pools.count(p) == 0) {
1705 pending_inc.new_pools[p] = pool.second;
1706 }
1707 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1708 }
1709 }
1710 }
1711 if (!full_pool_ids.empty()) {
1712 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1713 << " as full" << dendl;
1714 for (auto &p: full_pool_ids) {
1715 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1716 continue;
1717 }
1718 if (pending_inc.new_pools.count(p) == 0) {
1719 pending_inc.new_pools[p] = tmp.pools[p];
1720 }
1721 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1722 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1723 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1724 }
1725 // cancel FLAG_FULL for pools which are no longer full too
1726 for (auto &pool: tmp.get_pools()) {
1727 auto p = pool.first;
1728 if (full_pool_ids.count(p)) {
1729 // skip pools we have just marked as full above
1730 continue;
1731 }
1732 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1733 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1734 // don't touch if currently is not full
1735 // or is running out of quota (and hence considered as full)
1736 continue;
1737 }
1738 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1739 << "'s full flag" << dendl;
1740 if (pending_inc.new_pools.count(p) == 0) {
1741 pending_inc.new_pools[p] = pool.second;
1742 }
1743 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1744 }
1745 }
1746 if (!backfillfull_pool_ids.empty()) {
1747 for (auto &p: backfillfull_pool_ids) {
1748 if (full_pool_ids.count(p)) {
1749 // skip pools we have already considered as full above
1750 continue;
1751 }
1752 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1753 // make sure FLAG_FULL is truly set, so we are safe not
1754 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1755 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1756 continue;
1757 }
1758 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1759 // don't bother if pool is already marked as backfillfull
1760 continue;
1761 }
1762 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1763 << "'s as backfillfull" << dendl;
1764 if (pending_inc.new_pools.count(p) == 0) {
1765 pending_inc.new_pools[p] = tmp.pools[p];
1766 }
1767 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1768 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1769 }
1770 // cancel FLAG_BACKFILLFULL for pools
1771 // which are no longer backfillfull too
1772 for (auto &pool: tmp.get_pools()) {
1773 auto p = pool.first;
1774 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1775 // skip pools we have just marked as backfillfull/full above
1776 continue;
1777 }
1778 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1779 // and don't touch if currently is not backfillfull
1780 continue;
1781 }
1782 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1783 << "'s backfillfull flag" << dendl;
1784 if (pending_inc.new_pools.count(p) == 0) {
1785 pending_inc.new_pools[p] = pool.second;
1786 }
1787 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1788 }
1789 }
1790 if (!nearfull_pool_ids.empty()) {
1791 for (auto &p: nearfull_pool_ids) {
1792 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1793 continue;
1794 }
1795 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1796 // make sure FLAG_FULL is truly set, so we are safe not
1797 // to set a extra (redundant) FLAG_NEARFULL flag
1798 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1799 continue;
1800 }
1801 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1802 // don't bother if pool is already marked as nearfull
1803 continue;
1804 }
1805 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1806 << "'s as nearfull" << dendl;
1807 if (pending_inc.new_pools.count(p) == 0) {
1808 pending_inc.new_pools[p] = tmp.pools[p];
1809 }
1810 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1811 }
1812 // cancel FLAG_NEARFULL for pools
1813 // which are no longer nearfull too
1814 for (auto &pool: tmp.get_pools()) {
1815 auto p = pool.first;
1816 if (full_pool_ids.count(p) ||
1817 backfillfull_pool_ids.count(p) ||
1818 nearfull_pool_ids.count(p)) {
1819 // skip pools we have just marked as
1820 // nearfull/backfillfull/full above
1821 continue;
1822 }
1823 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1824 // and don't touch if currently is not nearfull
1825 continue;
1826 }
1827 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1828 << "'s nearfull flag" << dendl;
1829 if (pending_inc.new_pools.count(p) == 0) {
1830 pending_inc.new_pools[p] = pool.second;
1831 }
1832 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1833 }
1834 }
1835
1836 // min_compat_client?
1837 if (!tmp.require_min_compat_client) {
1838 auto mv = tmp.get_min_compat_client();
1839 dout(1) << __func__ << " setting require_min_compat_client to currently "
1840 << "required " << mv << dendl;
1841 mon.clog->info() << "setting require_min_compat_client to currently "
1842 << "required " << mv;
1843 pending_inc.new_require_min_compat_client = mv;
1844 }
1845
1846 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1847 tmp.require_osd_release >= ceph_release_t::nautilus) {
1848 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1849 // add creating flags?
1850 for (auto& i : tmp.get_pools()) {
1851 if (pending_creatings.still_creating_pool(i.first)) {
1852 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1853 << dendl;
1854 if (pending_inc.new_pools.count(i.first) == 0) {
1855 pending_inc.new_pools[i.first] = i.second;
1856 }
1857 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1858 }
1859 }
1860 // adjust blocklist items to all be TYPE_ANY
1861 for (auto& i : tmp.blocklist) {
1862 auto a = i.first;
1863 a.set_type(entity_addr_t::TYPE_ANY);
1864 pending_inc.new_blocklist[a] = i.second;
1865 pending_inc.old_blocklist.push_back(i.first);
1866 }
1867 }
1868
1869 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1870 tmp.require_osd_release >= ceph_release_t::octopus) {
1871 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1872
1873 // adjust obsoleted cache modes
1874 for (auto& [poolid, pi] : tmp.pools) {
1875 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1876 if (pending_inc.new_pools.count(poolid) == 0) {
1877 pending_inc.new_pools[poolid] = pi;
1878 }
1879 dout(10) << __func__ << " switching pool " << poolid
1880 << " cachemode from forward -> proxy" << dendl;
1881 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1882 }
1883 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1884 if (pending_inc.new_pools.count(poolid) == 0) {
1885 pending_inc.new_pools[poolid] = pi;
1886 }
1887 dout(10) << __func__ << " switching pool " << poolid
1888 << " cachemode from readforward -> readproxy" << dendl;
1889 pending_inc.new_pools[poolid].cache_mode =
1890 pg_pool_t::CACHEMODE_READPROXY;
1891 }
1892 }
1893
1894 // clear removed_snaps for every pool
1895 for (auto& [poolid, pi] : tmp.pools) {
1896 if (pi.removed_snaps.empty()) {
1897 continue;
1898 }
1899 if (pending_inc.new_pools.count(poolid) == 0) {
1900 pending_inc.new_pools[poolid] = pi;
1901 }
1902 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1903 << dendl;
1904 pending_inc.new_pools[poolid].removed_snaps.clear();
1905 }
1906
1907 // create a combined purged snap epoch key for all purged snaps
1908 // prior to this epoch, and store it in the current epoch (i.e.,
1909 // the last pre-octopus epoch, just prior to the one we're
1910 // encoding now).
1911 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1912 it->lower_bound("purged_snap_");
1913 map<int64_t,snap_interval_set_t> combined;
1914 while (it->valid()) {
1915 if (it->key().find("purged_snap_") != 0) {
1916 break;
1917 }
1918 string k = it->key();
1919 long long unsigned pool;
1920 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1921 if (n != 1) {
1922 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1923 } else {
1924 bufferlist v = it->value();
1925 auto p = v.cbegin();
1926 snapid_t begin, end;
1927 ceph::decode(begin, p);
1928 ceph::decode(end, p);
1929 combined[pool].insert(begin, end - begin);
1930 }
1931 it->next();
1932 }
1933 if (!combined.empty()) {
1934 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1935 bufferlist v;
1936 ceph::encode(combined, v);
1937 t->put(OSD_SNAP_PREFIX, k, v);
1938 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1939 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1940 << dendl;
1941 } else {
1942 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1943 << dendl;
1944 }
1945
1946 // clean out the old removed_snap_ and removed_epoch keys
1947 // ('`' is ASCII '_' + 1)
1948 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1949 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1950 }
1951 }
1952
1953 // tell me about it
1954 for (auto i = pending_inc.new_state.begin();
1955 i != pending_inc.new_state.end();
1956 ++i) {
1957 int s = i->second ? i->second : CEPH_OSD_UP;
1958 if (s & CEPH_OSD_UP) {
1959 dout(2) << " osd." << i->first << " DOWN" << dendl;
1960 // Reset laggy parameters if failure interval exceeds a threshold.
1961 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1962 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1963 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1964 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1965 set_default_laggy_params(i->first);
1966 }
1967 }
1968 }
1969 if (s & CEPH_OSD_EXISTS)
1970 dout(2) << " osd." << i->first << " DNE" << dendl;
1971 }
1972 for (auto i = pending_inc.new_up_client.begin();
1973 i != pending_inc.new_up_client.end();
1974 ++i) {
1975 //FIXME: insert cluster addresses too
1976 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1977 }
1978 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1979 i != pending_inc.new_weight.end();
1980 ++i) {
1981 if (i->second == CEPH_OSD_OUT) {
1982 dout(2) << " osd." << i->first << " OUT" << dendl;
1983 } else if (i->second == CEPH_OSD_IN) {
1984 dout(2) << " osd." << i->first << " IN" << dendl;
1985 } else {
1986 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1987 }
1988 }
1989
1990 // features for osdmap and its incremental
1991 uint64_t features;
1992
1993 // encode full map and determine its crc
1994 OSDMap tmp;
1995 {
1996 tmp.deepish_copy_from(osdmap);
1997 tmp.apply_incremental(pending_inc);
1998
1999 // determine appropriate features
2000 features = tmp.get_encoding_features();
2001 dout(10) << __func__ << " encoding full map with "
2002 << tmp.require_osd_release
2003 << " features " << features << dendl;
2004
2005 // the features should be a subset of the mon quorum's features!
2006 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2007
2008 bufferlist fullbl;
2009 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2010 pending_inc.full_crc = tmp.get_crc();
2011
2012 // include full map in the txn. note that old monitors will
2013 // overwrite this. new ones will now skip the local full map
2014 // encode and reload from this.
2015 put_version_full(t, pending_inc.epoch, fullbl);
2016 }
2017
2018 // encode
2019 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2020 bufferlist bl;
2021 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2022
2023 dout(20) << " full_crc " << tmp.get_crc()
2024 << " inc_crc " << pending_inc.inc_crc << dendl;
2025
2026 /* put everything in the transaction */
2027 put_version(t, pending_inc.epoch, bl);
2028 put_last_committed(t, pending_inc.epoch);
2029
2030 // metadata, too!
2031 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2032 p != pending_metadata.end();
2033 ++p) {
2034 Metadata m;
2035 auto mp = p->second.cbegin();
2036 decode(m, mp);
2037 auto it = m.find("osd_objectstore");
2038 if (it != m.end()) {
2039 if (it->second == "filestore") {
2040 filestore_osds.insert(p->first);
2041 } else {
2042 filestore_osds.erase(p->first);
2043 }
2044 }
2045 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2046 }
2047 for (set<int>::iterator p = pending_metadata_rm.begin();
2048 p != pending_metadata_rm.end();
2049 ++p) {
2050 filestore_osds.erase(*p);
2051 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2052 }
2053 pending_metadata.clear();
2054 pending_metadata_rm.clear();
2055
2056 // purged_snaps
2057 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2058 !pending_inc.new_purged_snaps.empty()) {
2059 // all snaps purged this epoch (across all pools)
2060 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2061 bufferlist v;
2062 encode(pending_inc.new_purged_snaps, v);
2063 t->put(OSD_SNAP_PREFIX, k, v);
2064 }
2065 for (auto& i : pending_inc.new_purged_snaps) {
2066 for (auto q = i.second.begin();
2067 q != i.second.end();
2068 ++q) {
2069 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2070 pending_inc.epoch,
2071 t);
2072 }
2073 }
2074 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2075 for (auto snap : snaps) {
2076 insert_purged_snap_update(pool, snap, snap + 1,
2077 pending_inc.epoch,
2078 t);
2079 }
2080 }
2081
2082 // health
2083 health_check_map_t next;
2084 tmp.check_health(cct, &next);
2085 // OSD_FILESTORE
2086 check_for_filestore_osds(&next);
2087 encode_health(next, t);
2088 }
2089
2090 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2091 {
2092 bufferlist bl;
2093 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2094 if (r < 0)
2095 return r;
2096 try {
2097 auto p = bl.cbegin();
2098 decode(m, p);
2099 }
2100 catch (ceph::buffer::error& e) {
2101 if (err)
2102 *err << "osd." << osd << " metadata is corrupt";
2103 return -EIO;
2104 }
2105 return 0;
2106 }
2107
2108 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2109 {
2110 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2111 if (osdmap.is_up(osd)) {
2112 map<string,string> meta;
2113 load_metadata(osd, meta, nullptr);
2114 auto p = meta.find(field);
2115 if (p == meta.end()) {
2116 (*out)["unknown"]++;
2117 } else {
2118 (*out)[p->second]++;
2119 }
2120 }
2121 }
2122 }
2123
2124 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2125 {
2126 map<string,int> by_val;
2127 count_metadata(field, &by_val);
2128 f->open_object_section(field.c_str());
2129 for (auto& p : by_val) {
2130 f->dump_int(p.first.c_str(), p.second);
2131 }
2132 f->close_section();
2133 }
2134
2135 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2136 {
2137 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2138 if (osdmap.is_up(osd)) {
2139 map<string,string> meta;
2140 load_metadata(osd, meta, nullptr);
2141 auto p = meta.find("ceph_version_short");
2142 if (p == meta.end()) continue;
2143 versions[p->second].push_back(string("osd.") + stringify(osd));
2144 }
2145 }
2146 }
2147
2148 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2149 {
2150 map<string, string> metadata;
2151 int r = load_metadata(osd, metadata, nullptr);
2152 if (r < 0)
2153 return r;
2154
2155 auto it = metadata.find("osd_objectstore");
2156 if (it == metadata.end())
2157 return -ENOENT;
2158 *type = it->second;
2159 return 0;
2160 }
2161
2162 void OSDMonitor::get_filestore_osd_list()
2163 {
2164 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2165 string objectstore_type;
2166 int r = get_osd_objectstore_type(osd, &objectstore_type);
2167 if (r == 0 && objectstore_type == "filestore") {
2168 filestore_osds.insert(osd);
2169 }
2170 }
2171 }
2172
2173 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2174 {
2175 if (g_conf()->mon_warn_on_filestore_osds &&
2176 filestore_osds.size() > 0) {
2177 ostringstream ss, deprecated_tip;
2178 list<string> detail;
2179 ss << filestore_osds.size()
2180 << " osd(s) "
2181 << (filestore_osds.size() == 1 ? "is" : "are")
2182 << " running Filestore";
2183 deprecated_tip << ss.str();
2184 ss << " [Deprecated]";
2185 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2186 filestore_osds.size());
2187 deprecated_tip << ", which has been deprecated and"
2188 << " not been optimized for QoS"
2189 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2190 detail.push_back(deprecated_tip.str());
2191 d.detail.swap(detail);
2192 }
2193 }
2194
2195 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2196 const pg_pool_t &pool,
2197 ostream *err)
2198 {
2199 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2200 // since filestore osds could always join the pool later
2201 set<int> checked_osds;
2202 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2203 vector<int> up, acting;
2204 pg_t pgid(ps, pool_id);
2205 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2206 for (int osd : up) {
2207 if (checked_osds.find(osd) != checked_osds.end())
2208 continue;
2209 string objectstore_type;
2210 int r = get_osd_objectstore_type(osd, &objectstore_type);
2211 // allow with missing metadata, e.g. due to an osd never booting yet
2212 if (r < 0 || objectstore_type == "bluestore") {
2213 checked_osds.insert(osd);
2214 continue;
2215 }
2216 *err << "osd." << osd << " uses " << objectstore_type;
2217 return false;
2218 }
2219 }
2220 return true;
2221 }
2222
2223 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2224 {
2225 map<string,string> m;
2226 if (int r = load_metadata(osd, m, err))
2227 return r;
2228 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2229 f->dump_string(p->first.c_str(), p->second);
2230 return 0;
2231 }
2232
2233 void OSDMonitor::print_nodes(Formatter *f)
2234 {
2235 // group OSDs by their hosts
2236 map<string, list<int> > osds; // hostname => osd
2237 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2238 map<string, string> m;
2239 if (load_metadata(osd, m, NULL)) {
2240 continue;
2241 }
2242 map<string, string>::iterator hostname = m.find("hostname");
2243 if (hostname == m.end()) {
2244 // not likely though
2245 continue;
2246 }
2247 osds[hostname->second].push_back(osd);
2248 }
2249
2250 dump_services(f, osds, "osd");
2251 }
2252
2253 void OSDMonitor::share_map_with_random_osd()
2254 {
2255 if (osdmap.get_num_up_osds() == 0) {
2256 dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
2257 return;
2258 }
2259
2260 MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
2261 if (!s) {
2262 dout(10) << __func__ << " no up osd on our session map" << dendl;
2263 return;
2264 }
2265
2266 dout(10) << "committed, telling random " << s->name
2267 << " all about it" << dendl;
2268
2269 // get feature of the peer
2270 // use quorum_con_features, if it's an anonymous connection.
2271 uint64_t features = s->con_features ? s->con_features :
2272 mon.get_quorum_con_features();
2273 // whatev, they'll request more if they need it
2274 MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
2275 s->con->send_message(m);
2276 // NOTE: do *not* record osd has up to this epoch (as we do
2277 // elsewhere) as they may still need to request older values.
2278 }
2279
// Compute the highest osdmap version we may trim up to, or 0 when
// trimming must be deferred (no quorum, pgs still being created, trim
// debug-blocked, or not enough epochs above the safe floor).
version_t OSDMonitor::get_trim_to() const
{
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // don't trim while pgs are still being created; their creation
    // state references older maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	    << " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // start from the oldest epoch some osd may still need
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    // an operator may force a specific trim point via config
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
	       << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only report a trim point if it actually advances past what is
    // already trimmed
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2327
2328 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2329 {
2330 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2331 // also scan osd epochs
2332 // don't trim past the oldest reported osd epoch
2333 for (auto [osd, epoch] : osd_epochs) {
2334 if (epoch < floor) {
2335 floor = epoch;
2336 }
2337 }
2338 return floor;
2339 }
2340
2341 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2342 version_t first)
2343 {
2344 dout(10) << __func__ << " including full map for e " << first << dendl;
2345 bufferlist bl;
2346 get_version_full(first, bl);
2347 put_version_full(tx, first, bl);
2348
2349 if (has_osdmap_manifest &&
2350 first > osdmap_manifest.get_first_pinned()) {
2351 _prune_update_trimmed(tx, first);
2352 }
2353 }
2354
2355
2356 /* full osdmap prune
2357 *
2358 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2359 */
2360
2361 void OSDMonitor::load_osdmap_manifest()
2362 {
2363 bool store_has_manifest =
2364 mon.store->exists(get_service_name(), "osdmap_manifest");
2365
2366 if (!store_has_manifest) {
2367 if (!has_osdmap_manifest) {
2368 return;
2369 }
2370
2371 dout(20) << __func__
2372 << " dropping osdmap manifest from memory." << dendl;
2373 osdmap_manifest = osdmap_manifest_t();
2374 has_osdmap_manifest = false;
2375 return;
2376 }
2377
2378 dout(20) << __func__
2379 << " osdmap manifest detected in store; reload." << dendl;
2380
2381 bufferlist manifest_bl;
2382 int r = get_value("osdmap_manifest", manifest_bl);
2383 if (r < 0) {
2384 derr << __func__ << " unable to read osdmap version manifest" << dendl;
2385 ceph_abort_msg("error reading manifest");
2386 }
2387 osdmap_manifest.decode(manifest_bl);
2388 has_osdmap_manifest = true;
2389
2390 dout(10) << __func__ << " store osdmap manifest pinned ("
2391 << osdmap_manifest.get_first_pinned()
2392 << " .. "
2393 << osdmap_manifest.get_last_pinned()
2394 << ")"
2395 << dendl;
2396 }
2397
// Decide whether a prune pass should run now, based on how many full
// map epochs we hold relative to the configured minimums and the
// progress recorded in the on-disk manifest.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // never touch the newest min_osdmap_epochs maps
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
             << " currently holding only " << (last - first)
             << " epochs (min osdmap epochs: " << min_osdmap_epochs
             << "); do not prune."
             << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
             << " could only prune " << (last_to_pin - first)
             << " epochs (" << first << ".." << last_to_pin << "), which"
                " is less than the required minimum (" << prune_min << ")"
             << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // a previous pass already pinned everything we could pin now
    dout(10) << __func__
             << " we have pruned as far as we can; do not prune."
             << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits between the last pinned
    // epoch and the highest prunable epoch
    dout(10) << __func__
             << " not enough epochs to form an interval (last pinned: "
             << last_pinned << ", last to pin: "
             << last_to_pin << ", interval: " << prune_interval << ")"
             << dendl;
    return false;
  }

  dout(15) << __func__
           << " should prune (" << last_pinned << ".." << last_to_pin << ")"
           << " lc (" << first << ".." << last << ")"
           << dendl;
  return true;
}
2457
// Called while trimming up to 'first': drop all pinned epochs below
// 'first' from the manifest (pinning 'first' itself first, so the
// manifest's floor matches the store's), and either persist the
// shrunken manifest or erase it entirely once no useful pins remain.
void OSDMonitor::_prune_update_trimmed(
  MonitorDBStore::TransactionRef tx,
  version_t first)
{
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  // work on a copy; the in-memory manifest is refreshed on commit
  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pinned epoch strictly below 'first'
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2491
// Seed 'manifest' with the first epoch to pin for a prune pass: the
// first committed epoch when we have never pruned (or prior state was
// fully trimmed away), otherwise the last epoch pinned by the previous
// pass.  Asserts that the in-memory/on-disk manifest state is coherent.
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2526
2527 bool OSDMonitor::_prune_sanitize_options() const
2528 {
2529 uint64_t prune_interval =
2530 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2531 uint64_t prune_min =
2532 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2533 uint64_t txsize =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2535
2536 bool r = true;
2537
2538 if (prune_interval == 0) {
2539 derr << __func__
2540 << " prune is enabled BUT prune interval is zero; abort."
2541 << dendl;
2542 r = false;
2543 } else if (prune_interval == 1) {
2544 derr << __func__
2545 << " prune interval is equal to one, which essentially means"
2546 " no pruning; abort."
2547 << dendl;
2548 r = false;
2549 }
2550 if (prune_min == 0) {
2551 derr << __func__
2552 << " prune is enabled BUT prune min is zero; abort."
2553 << dendl;
2554 r = false;
2555 }
2556 if (prune_interval > prune_min) {
2557 derr << __func__
2558 << " impossible to ascertain proper prune interval because"
2559 << " it is greater than the minimum prune epochs"
2560 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2561 << dendl;
2562 r = false;
2563 }
2564
2565 if (txsize < prune_interval - 1) {
2566 derr << __func__
2567 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2568 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2569 << "); abort." << dendl;
2570 r = false;
2571 }
2572 return r;
2573 }
2574
2575 bool OSDMonitor::is_prune_enabled() const {
2576 return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
2577 }
2578
2579 bool OSDMonitor::is_prune_supported() const {
2580 return mon.get_required_mon_features().contains_any(
2581 ceph::features::mon::FEATURE_OSDMAP_PRUNE);
2582 }
2583
/** do_prune
 *
 * Prune full osdmap versions from the store: pin one version every
 * `mon_osdmap_full_prune_interval` epochs and erase the versions in
 * between, removing at most `mon_osdmap_full_prune_txsize` versions in
 * this transaction.  The updated manifest is persisted alongside the
 * erasures.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
          << ( enabled ? "enabled" : "disabled")
          << dendl;

  // bail out early unless pruning is on, sanely configured, and due
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    // stop once a full interval no longer fits below last_to_pin
    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // both interval endpoints must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase everything strictly between the two pinned endpoints
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  ceph_assert(num_pruned > 0);

  // persist the updated manifest alongside the erasures
  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2712
2713
2714 // -------------
2715
// Dispatch an incoming message for read-only handling.  Returns true
// when the op was fully handled (or rejected) here; returns false when
// the op needs a map update, in which case prepare_update() will be
// invoked for it next.
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // routing bug: this service should never receive other types
    ceph_abort();
    return true;
  }
}
2771
// Dispatch a message that requires updating the pending osdmap.
// Returns true if the update should be proposed (or the op was
// consumed); false otherwise.  Only called for ops that
// preprocess_query() declined to fully handle.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // routing bug: this service should never receive other types
    ceph_abort();
  }

  return false;
}
2823
2824 bool OSDMonitor::should_propose(double& delay)
2825 {
2826 dout(10) << "should_propose" << dendl;
2827
2828 // if full map, propose immediately! any subsequent changes will be clobbered.
2829 if (pending_inc.fullmap.length())
2830 return true;
2831
2832 // adjust osd weights?
2833 if (!osd_weight.empty() &&
2834 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2835 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2836 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2837 delay = 0.0;
2838 osd_weight.clear();
2839 return true;
2840 }
2841
2842 return PaxosService::should_propose(delay);
2843 }
2844
2845
2846
2847 // ---------------------------
2848 // READs
2849
// Answer an explicit MMonGetOSDMap request with the requested ranges of
// full and incremental maps, bounded by the configured per-message map
// count and byte limits.  Always fully handled here (returns true).
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the requester's connection features when known;
  // otherwise fall back to the quorum's features
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first; stop when either the count or the byte budget
  // runs out
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // then incrementals, drawing from the same remaining budgets
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  reply->oldest_map = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2886
2887
2888 // ---------------------------
2889 // UPDATEs
2890
2891 // failure --
2892
// Sanity-check the sender of an osd-originated message.  Note the
// inverted sense: returns true when the message should be REJECTED
// (no session, insufficient 'osd' caps, or fsid mismatch), false when
// the source checks out.
bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
  // check permissions
  MonSession *session = op->get_session();
  if (!session)
    return true;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got MOSDFailure from entity with insufficient caps "
	    << session->caps << dendl;
    return true;
  }
  if (fsid != mon.monmap->fsid) {
    dout(0) << "check_source: on fsid " << fsid
	    << " != " << mon.monmap->fsid << dendl;
    return true;
  }
  return false;
}
2910
2911
// Filter an incoming MOSDFailure report.  Returns true when the report
// can be answered or dropped right here (bad source, target already
// down, wrong target addrs, dup/old report, or target we may not mark
// down); returns false when the report is new and must proceed to
// prepare_failure().
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown, renamed, or itself down; send it a newer
      // map so it can catch up
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report refers to an older instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  // genuinely new failure report; let prepare_failure() record it
  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  mon.no_reply(op);
  return true;
}
2983
2984 class C_AckMarkedDown : public C_MonOp {
2985 OSDMonitor *osdmon;
2986 public:
2987 C_AckMarkedDown(
2988 OSDMonitor *osdmon,
2989 MonOpRequestRef op)
2990 : C_MonOp(op), osdmon(osdmon) {}
2991
2992 void _finish(int r) override {
2993 if (r == 0) {
2994 auto m = op->get_req<MOSDMarkMeDown>();
2995 osdmon->mon.send_reply(
2996 op,
2997 new MOSDMarkMeDown(
2998 m->fsid,
2999 m->target_osd,
3000 m->target_addrs,
3001 m->get_epoch(),
3002 false)); // ACK itself does not request an ack
3003 } else if (r == -EAGAIN) {
3004 osdmon->dispatch(op);
3005 } else {
3006 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
3007 }
3008 }
3009 ~C_AckMarkedDown() override {
3010 }
3011 };
3012
3013 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
3014 {
3015 op->mark_osdmon_event(__func__);
3016 auto m = op->get_req<MOSDMarkMeDown>();
3017 int from = m->target_osd;
3018
3019 // check permissions
3020 if (check_source(op, m->fsid))
3021 goto reply;
3022
3023 // first, verify the reporting host is valid
3024 if (!m->get_orig_source().is_osd())
3025 goto reply;
3026
3027 if (!osdmap.exists(from) ||
3028 osdmap.is_down(from) ||
3029 osdmap.get_addrs(from) != m->target_addrs) {
3030 dout(5) << "preprocess_mark_me_down from dead osd."
3031 << from << ", ignoring" << dendl;
3032 send_incremental(op, m->get_epoch()+1);
3033 goto reply;
3034 }
3035
3036 // no down might be set
3037 if (!can_mark_down(from))
3038 goto reply;
3039
3040 dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
3041 << " " << m->target_addrs << dendl;
3042 return false;
3043
3044 reply:
3045 if (m->request_ack) {
3046 Context *c(new C_AckMarkedDown(this, op));
3047 c->complete(0);
3048 }
3049 return true;
3050 }
3051
3052 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
3053 {
3054 op->mark_osdmon_event(__func__);
3055 auto m = op->get_req<MOSDMarkMeDown>();
3056 int target_osd = m->target_osd;
3057
3058 ceph_assert(osdmap.is_up(target_osd));
3059 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
3060
3061 mon.clog->info() << "osd." << target_osd << " marked itself down";
3062 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3063 if (m->request_ack)
3064 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
3065 return true;
3066 }
3067
3068 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
3069 {
3070 op->mark_osdmon_event(__func__);
3071 auto m = op->get_req<MOSDMarkMeDead>();
3072 int from = m->target_osd;
3073
3074 // check permissions
3075 if (check_source(op, m->fsid)) {
3076 mon.no_reply(op);
3077 return true;
3078 }
3079
3080 // first, verify the reporting host is valid
3081 if (!m->get_orig_source().is_osd()) {
3082 mon.no_reply(op);
3083 return true;
3084 }
3085
3086 if (!osdmap.exists(from) ||
3087 !osdmap.is_down(from)) {
3088 dout(5) << __func__ << " from nonexistent or up osd." << from
3089 << ", ignoring" << dendl;
3090 send_incremental(op, m->get_epoch()+1);
3091 mon.no_reply(op);
3092 return true;
3093 }
3094
3095 return false;
3096 }
3097
3098 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
3099 {
3100 op->mark_osdmon_event(__func__);
3101 auto m = op->get_req<MOSDMarkMeDead>();
3102 int target_osd = m->target_osd;
3103
3104 ceph_assert(osdmap.is_down(target_osd));
3105
3106 mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
3107 << m->get_epoch();
3108 if (!pending_inc.new_xinfo.count(target_osd)) {
3109 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3110 }
3111 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
3112 wait_for_finished_proposal(
3113 op,
3114 new LambdaContext(
3115 [op, this] (int r) {
3116 if (r >= 0) {
3117 mon.no_reply(op); // ignore on success
3118 }
3119 }
3120 ));
3121 return true;
3122 }
3123
3124 bool OSDMonitor::can_mark_down(int i)
3125 {
3126 if (osdmap.is_nodown(i)) {
3127 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3128 << "will not mark it down" << dendl;
3129 return false;
3130 }
3131
3132 int num_osds = osdmap.get_num_osds();
3133 if (num_osds == 0) {
3134 dout(5) << __func__ << " no osds" << dendl;
3135 return false;
3136 }
3137 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3138 float up_ratio = (float)up / (float)num_osds;
3139 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3140 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3141 << g_conf()->mon_osd_min_up_ratio
3142 << ", will not mark osd." << i << " down" << dendl;
3143 return false;
3144 }
3145 return true;
3146 }
3147
3148 bool OSDMonitor::can_mark_up(int i)
3149 {
3150 if (osdmap.is_noup(i)) {
3151 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3152 << "will not mark it up" << dendl;
3153 return false;
3154 }
3155
3156 return true;
3157 }
3158
3159 /**
3160 * @note the parameter @p i apparently only exists here so we can output the
3161 * osd's id on messages.
3162 */
3163 bool OSDMonitor::can_mark_out(int i)
3164 {
3165 if (osdmap.is_noout(i)) {
3166 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3167 << "will not mark it out" << dendl;
3168 return false;
3169 }
3170
3171 int num_osds = osdmap.get_num_osds();
3172 if (num_osds == 0) {
3173 dout(5) << __func__ << " no osds" << dendl;
3174 return false;
3175 }
3176 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3177 float in_ratio = (float)in / (float)num_osds;
3178 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3179 if (i >= 0)
3180 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3181 << g_conf()->mon_osd_min_in_ratio
3182 << ", will not mark osd." << i << " out" << dendl;
3183 else
3184 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3185 << g_conf()->mon_osd_min_in_ratio
3186 << ", will not mark osds out" << dendl;
3187 return false;
3188 }
3189
3190 return true;
3191 }
3192
3193 bool OSDMonitor::can_mark_in(int i)
3194 {
3195 if (osdmap.is_noin(i)) {
3196 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3197 << "will not mark it in" << dendl;
3198 return false;
3199 }
3200
3201 return true;
3202 }
3203
3204 bool OSDMonitor::check_failures(utime_t now)
3205 {
3206 bool found_failure = false;
3207 auto p = failure_info.begin();
3208 while (p != failure_info.end()) {
3209 auto& [target_osd, fi] = *p;
3210 if (can_mark_down(target_osd) &&
3211 check_failure(now, target_osd, fi)) {
3212 found_failure = true;
3213 ++p;
3214 } else if (is_failure_stale(now, fi)) {
3215 dout(10) << " dropping stale failure_info for osd." << target_osd
3216 << " from " << fi.reporters.size() << " reporters"
3217 << dendl;
3218 p = failure_info.erase(p);
3219 } else {
3220 ++p;
3221 }
3222 }
3223 return found_failure;
3224 }
3225
3226 utime_t OSDMonitor::get_grace_time(utime_t now,
3227 int target_osd,
3228 failure_info_t& fi) const
3229 {
3230 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3231 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3232 return orig_grace;
3233 }
3234 utime_t grace = orig_grace;
3235 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3236 double decay_k = ::log(.5) / halflife;
3237
3238 // scale grace period based on historical probability of 'lagginess'
3239 // (false positive failures due to slowness).
3240 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3241 const utime_t failed_for = now - fi.get_failed_since();
3242 double decay = exp((double)failed_for * decay_k);
3243 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3244 << " failed_for " << failed_for << " decay " << decay << dendl;
3245 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3246 grace += my_grace;
3247
3248 // consider the peers reporting a failure a proxy for a potential
3249 // 'subcluster' over the overall cluster that is similarly
3250 // laggy. this is clearly not true in all cases, but will sometimes
3251 // help us localize the grace correction to a subset of the system
3252 // (say, a rack with a bad switch) that is unhappy.
3253 double peer_grace = 0;
3254 for (auto& [reporter, report] : fi.reporters) {
3255 if (osdmap.exists(reporter)) {
3256 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3257 utime_t elapsed = now - xi.down_stamp;
3258 double decay = exp((double)elapsed * decay_k);
3259 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3260 }
3261 }
3262 peer_grace /= (double)fi.reporters.size();
3263 grace += peer_grace;
3264 dout(10) << " osd." << target_osd << " has "
3265 << fi.reporters.size() << " reporters, "
3266 << grace << " grace (" << orig_grace << " + " << my_grace
3267 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3268 << dendl;
3269
3270 return grace;
3271 }
3272
3273 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3274 {
3275 // already pending failure?
3276 if (pending_inc.new_state.count(target_osd) &&
3277 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3278 dout(10) << " already pending failure" << dendl;
3279 return true;
3280 }
3281
3282 set<string> reporters_by_subtree;
3283 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3284 ceph_assert(fi.reporters.size());
3285 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3286 // get the parent bucket whose type matches with "reporter_subtree_level".
3287 // fall back to OSD if the level doesn't exist.
3288 if (osdmap.exists(p->first)) {
3289 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3290 if (auto iter = reporter_loc.find(reporter_subtree_level);
3291 iter == reporter_loc.end()) {
3292 reporters_by_subtree.insert("osd." + to_string(p->first));
3293 } else {
3294 reporters_by_subtree.insert(iter->second);
3295 }
3296 ++p;
3297 } else {
3298 fi.cancel_report(p->first);;
3299 p = fi.reporters.erase(p);
3300 }
3301 }
3302 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3303 return false;
3304 }
3305 const utime_t failed_for = now - fi.get_failed_since();
3306 const utime_t grace = get_grace_time(now, target_osd, fi);
3307 if (failed_for >= grace) {
3308 dout(1) << " we have enough reporters to mark osd." << target_osd
3309 << " down" << dendl;
3310 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3311
3312 mon.clog->info() << "osd." << target_osd << " failed ("
3313 << osdmap.crush->get_full_location_ordered_string(
3314 target_osd)
3315 << ") ("
3316 << (int)reporters_by_subtree.size()
3317 << " reporters from different "
3318 << reporter_subtree_level << " after "
3319 << failed_for << " >= grace " << grace << ")";
3320 return true;
3321 }
3322 return false;
3323 }
3324
3325 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3326 {
3327 // if it takes too long to either cancel the report to mark the osd down,
3328 // some reporters must have failed to cancel their reports. let's just
3329 // forget these reports.
3330 const utime_t failed_for = now - fi.get_failed_since();
3331 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3332 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3333 return failed_for >= (heartbeat_grace + heartbeat_stale);
3334 }
3335
3336 void OSDMonitor::force_failure(int target_osd, int by)
3337 {
3338 // already pending failure?
3339 if (pending_inc.new_state.count(target_osd) &&
3340 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3341 dout(10) << " already pending failure" << dendl;
3342 return;
3343 }
3344
3345 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3346 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3347 if (!pending_inc.new_xinfo.count(target_osd)) {
3348 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3349 }
3350 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3351
3352 mon.clog->info() << "osd." << target_osd << " failed ("
3353 << osdmap.crush->get_full_location_ordered_string(target_osd)
3354 << ") (connection refused reported by osd." << by << ")";
3355 return;
3356 }
3357
// Apply an OSD failure report (or its cancellation) to pending state.
// Returns true when the report pushed the target over the failure
// threshold (or was immediate) so a proposal is needed now, false when we
// only recorded/removed the report.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
          << " " << m->get_target_addrs()
          << " from " << m->get_orig_source()
          << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() is expected to have filtered out anything else
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: do not wait for more reporters
      mon.clog->debug() << "osd." << m->get_target_osd()
                        << " reported immediately failed by "
                        << m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
                      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
                      << " failure report canceled by "
                      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
        dout(10) << " removing last failure_info for osd." << target_osd
                 << dendl;
        failure_info.erase(target_osd);
      } else {
        dout(10) << " failure_info for osd." << target_osd << " now "
                 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3417
3418 void OSDMonitor::process_failures()
3419 {
3420 map<int,failure_info_t>::iterator p = failure_info.begin();
3421 while (p != failure_info.end()) {
3422 if (osdmap.is_up(p->first)) {
3423 ++p;
3424 } else {
3425 dout(10) << "process_failures osd." << p->first << dendl;
3426 list<MonOpRequestRef> ls;
3427 p->second.take_report_messages(ls);
3428 failure_info.erase(p++);
3429
3430 while (!ls.empty()) {
3431 MonOpRequestRef o = ls.front();
3432 if (o) {
3433 o->mark_event(__func__);
3434 MOSDFailure *m = o->get_req<MOSDFailure>();
3435 send_latest(o, m->get_epoch());
3436 mon.no_reply(o);
3437 }
3438 ls.pop_front();
3439 }
3440 }
3441 }
3442 }
3443
3444 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3445 {
3446 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3447
3448 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3449 p != failure_info.end();
3450 ++p) {
3451 p->second.take_report_messages(ls);
3452 }
3453 failure_info.clear();
3454 }
3455
3456 int OSDMonitor::get_grace_interval_threshold()
3457 {
3458 int halflife = g_conf()->mon_osd_laggy_halflife;
3459 // Scale the halflife period (default: 1_hr) by
3460 // a factor (48) to calculate the threshold.
3461 int grace_threshold_factor = 48;
3462 return halflife * grace_threshold_factor;
3463 }
3464
3465 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3466 {
3467 int grace_interval_threshold_secs = get_grace_interval_threshold();
3468 if (last_failed_interval > grace_interval_threshold_secs) {
3469 dout(1) << " last_failed_interval " << last_failed_interval
3470 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3471 << dendl;
3472 return true;
3473 }
3474 return false;
3475 }
3476
3477 void OSDMonitor::set_default_laggy_params(int target_osd)
3478 {
3479 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3480 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3481 }
3482 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3483 xi.down_stamp = pending_inc.modified;
3484 xi.laggy_probability = 0.0;
3485 xi.laggy_interval = 0;
3486 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3487 }
3488
3489
3490 // boot --
3491
// Fast checks on an OSD boot message (MOSDBoot).  Returns true when the
// message was handled here (ignored, or answered as a duplicate), false
// when prepare_boot() should update the pending map.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
            << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
            << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because the osd lacks CEPH_FEATURE_SERVER_OCTOPUS";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
      osdmap.require_osd_release < ceph_release_t::nautilus) {
    mon.clog->info() << "disallowing boot of pacific+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < nautilus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
                     << m->get_orig_source_inst()
                     << " because require_osd_release < octopus";
    goto ignore;
  }

  // stretch mode requires explicit feature support from every osd
  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
                     << m->get_orig_source_inst()
                     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
            << " " << m->get_orig_source_addrs()
            << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // the id is already bound to a different daemon (different osd_fsid)?
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
            << " clashes with existing osd: different fsid"
            << " (ours: " << osdmap.get_uuid(from)
            << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // a boot message that predates the osd's last up_from is stale
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
        m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3597
// Update the pending map to bring a booting osd up: mark a stale
// incarnation down first if needed, otherwise record addrs, uuid,
// metadata, clean-interval, laggy estimates, features, and possibly the
// 'in' weight.  The reply is deferred until the proposal commits.
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
          << " sb " << m->sb
          << " client_addrs" << m->get_connection()->get_peer_addrs()
          << " cluster_addrs " << m->cluster_addrs
          << " hb_back_addrs " << m->hb_back_addrs
          << " hb_front_addrs " << m->hb_front_addrs
          << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
            << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective old state = committed state with any pending XOR delta applied
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
            << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
                  m->get_orig_source_addrs()) ||
                !osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
        (pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry this boot message once the down proposal has committed
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
            << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from);  // if any

    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
             << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd?
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
        // a fresh osd (newest_map == 0) carries no data from its previous
        // incarnation, so record the data as lost too
        dout(10) << " fresh osd; marking lost_at too" << dendl;
        pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
        (m->sb.mounted == info.last_clean_begin &&
         m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
               << "[" << info.last_clean_begin << "," << info.last_clean_end
               << ") -> [" << begin << "-" << end << ")"
               << dendl;
      pending_inc.new_last_clean_interval[from] =
        pair<epoch_t,epoch_t>(begin, end);
    }

    // update the exponentially-weighted laggy estimates in xinfo
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // no prior boot epoch: treat as not laggy; decay the estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after being marked down: fold the observed down interval
      // (capped at mon_osd_laggy_max_interval) into the laggy estimates
      if (xi.down_stamp.sec()) {
        int interval = ceph_clock_now().sec() -
          xi.down_stamp.sec();
        if (g_conf()->mon_osd_laggy_max_interval &&
            (interval > g_conf()->mon_osd_laggy_max_interval)) {
          interval = g_conf()->mon_osd_laggy_max_interval;
        }
        xi.laggy_interval =
          interval * g_conf()->mon_osd_laggy_weight +
          xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
        g_conf()->mon_osd_laggy_weight +
        xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
         (oldstate & CEPH_OSD_AUTOOUT)) ||
        (g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
        (g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
        if (xi.old_weight > 0) {
          // restore the weight the osd had before it was auto-marked out
          pending_inc.new_weight[from] = xi.old_weight;
          xi.old_weight = 0;
        } else {
          pending_inc.new_weight[from] = CEPH_OSD_IN;
        }
      } else {
        dout(7) << __func__ << " NOIN set, will not mark in "
                << m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3752
3753 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3754 {
3755 op->mark_osdmon_event(__func__);
3756 auto m = op->get_req<MOSDBoot>();
3757 dout(7) << "_booted " << m->get_orig_source_inst()
3758 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3759
3760 if (logit) {
3761 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3762 << " boot";
3763 }
3764
3765 send_latest(op, m->sb.current_epoch+1);
3766 }
3767
3768
3769 // -------------
3770 // full
3771
3772 bool OSDMonitor::preprocess_full(MonOpRequestRef op)
3773 {
3774 op->mark_osdmon_event(__func__);
3775 auto m = op->get_req<MOSDFull>();
3776 int from = m->get_orig_source().num();
3777 set<string> state;
3778 unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
3779
3780 // check permissions, ignore if failed
3781 MonSession *session = op->get_session();
3782 if (!session)
3783 goto ignore;
3784 if (!session->is_capable("osd", MON_CAP_X)) {
3785 dout(0) << "MOSDFull from entity with insufficient privileges:"
3786 << session->caps << dendl;
3787 goto ignore;
3788 }
3789
3790 // ignore a full message from the osd instance that already went down
3791 if (!osdmap.exists(from)) {
3792 dout(7) << __func__ << " ignoring full message from nonexistent "
3793 << m->get_orig_source_inst() << dendl;
3794 goto ignore;
3795 }
3796 if ((!osdmap.is_up(from) &&
3797 osdmap.get_most_recent_addrs(from).legacy_equals(
3798 m->get_orig_source_addrs())) ||
3799 (osdmap.is_up(from) &&
3800 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
3801 dout(7) << __func__ << " ignoring full message from down "
3802 << m->get_orig_source_inst() << dendl;
3803 goto ignore;
3804 }
3805
3806 OSDMap::calc_state_set(osdmap.get_state(from), state);
3807
3808 if ((osdmap.get_state(from) & mask) == m->state) {
3809 dout(7) << __func__ << " state already " << state << " for osd." << from
3810 << " " << m->get_orig_source_inst() << dendl;
3811 _reply_map(op, m->version);
3812 goto ignore;
3813 }
3814
3815 dout(10) << __func__ << " want state " << state << " for osd." << from
3816 << " " << m->get_orig_source_inst() << dendl;
3817 return false;
3818
3819 ignore:
3820 return true;
3821 }
3822
// Apply a requested full/backfillfull/nearfull state change for an osd.
// pending_inc.new_state holds XOR deltas against the committed map, so we
// must compose the wanted bits with any change already pending.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask;  // safety first

  // effective current state = committed state with pending XOR applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // clear any pending fullness bits; the delta is recomputed below
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // XOR of committed vs wanted fullness bits is exactly the delta to flip
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
            << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3860
3861 // -------------
3862 // alive
3863
3864 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3865 {
3866 op->mark_osdmon_event(__func__);
3867 auto m = op->get_req<MOSDAlive>();
3868 int from = m->get_orig_source().num();
3869
3870 // check permissions, ignore if failed
3871 MonSession *session = op->get_session();
3872 if (!session)
3873 goto ignore;
3874 if (!session->is_capable("osd", MON_CAP_X)) {
3875 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3876 << session->caps << dendl;
3877 goto ignore;
3878 }
3879
3880 if (!osdmap.is_up(from) ||
3881 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3882 dout(7) << "preprocess_alive ignoring alive message from down "
3883 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3884 << dendl;
3885 goto ignore;
3886 }
3887
3888 if (osdmap.get_up_thru(from) >= m->want) {
3889 // yup.
3890 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3891 _reply_map(op, m->version);
3892 return true;
3893 }
3894
3895 dout(10) << "preprocess_alive want up_thru " << m->want
3896 << " from " << m->get_orig_source_inst() << dendl;
3897 return false;
3898
3899 ignore:
3900 return true;
3901 }
3902
3903 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3904 {
3905 op->mark_osdmon_event(__func__);
3906 auto m = op->get_req<MOSDAlive>();
3907 int from = m->get_orig_source().num();
3908
3909 if (0) { // we probably don't care much about these
3910 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3911 }
3912
3913 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3914 << " from " << m->get_orig_source_inst() << dendl;
3915
3916 update_up_thru(from, m->version); // set to the latest map the OSD has
3917 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3918 return true;
3919 }
3920
// Reply to `op` by sending the requester the latest osdmap(s) starting at
// epoch e (full map when e == 0; see send_latest()).
void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
{
  op->mark_osdmon_event(__func__);
  dout(7) << "_reply_map " << e
	  << " from " << op->get_req()->get_orig_source_inst()
	  << dendl;
  send_latest(op, e);
}
3929
3930 // pg_created
// Read-side handler for MOSDPGCreated acknowledgements. Returns true when
// the message is dropped here (no session / insufficient caps); returns
// false so the message is forwarded to the leader's prepare_pg_created().
bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGCreated>();
  dout(10) << __func__ << " " << *m << dendl;
  auto session = op->get_session();
  // pg-created messages never get a direct reply
  mon.no_reply(op);
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    return true;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    return true;
  }
  // always forward the "created!" to the leader
  return false;
}
3950
3951 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3952 {
3953 op->mark_osdmon_event(__func__);
3954 auto m = op->get_req<MOSDPGCreated>();
3955 dout(10) << __func__ << " " << *m << dendl;
3956 auto src = m->get_orig_source();
3957 auto from = src.num();
3958 if (!src.is_osd() ||
3959 !mon.osdmon()->osdmap.is_up(from) ||
3960 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3961 m->get_orig_source_addrs())) {
3962 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
3963 return false;
3964 }
3965 pending_created_pgs.push_back(m->pgid);
3966 return true;
3967 }
3968
// Read-side validation of MOSDPGReadyToMerge. Returns true when the message
// is dropped here (bad session/caps, missing pool, or stale pg-merge state);
// returns false to forward it to prepare_pg_ready_to_merge() on the leader.
bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // declared up front because the caps checks below jump past it via goto
  const pg_pool_t *pi;
  auto session = op->get_session();
  if (!session) {
    dout(10) << __func__ << ": no monitor session!" << dendl;
    goto ignore;
  }
  if (!session->is_capable("osd", MON_CAP_X)) {
    derr << __func__ << " received from entity "
	 << "with insufficient privileges " << session->caps << dendl;
    goto ignore;
  }
  pi = osdmap.get_pg_pool(m->pgid.pool());
  if (!pi) {
    derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
    goto ignore;
  }
  // merge already happened: pgid is no longer within the pool's pg_num
  if (pi->get_pg_num() <= m->pgid.ps()) {
    dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
    goto ignore;
  }
  // only the current merge source (the last pg, ps == pg_num - 1) may ask
  if (pi->get_pg_num() != m->pgid.ps() + 1) {
    derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
    goto ignore;
  }
  // no merge is actually pending for this pg
  if (pi->get_pg_num_pending() > m->pgid.ps()) {
    dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
    goto ignore;
  }
  return false;

 ignore:
  mon.no_reply(op);
  return true;
}
4008
// Leader-side commit handler for MOSDPGReadyToMerge: apply (or back off) a
// pending pg merge on the pool, then reply once the map change commits.
// Always returns true.
bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDPGReadyToMerge>();
  dout(10) << __func__ << " " << *m << dendl;
  // work on the pending copy of the pool if one exists, else the committed one
  pg_pool_t p;
  if (pending_inc.new_pools.count(m->pgid.pool()))
    p = pending_inc.new_pools[m->pgid.pool()];
  else
    p = *osdmap.get_pg_pool(m->pgid.pool());
  // re-validate against the pending state (preprocess only checked the
  // committed map); if pg_num[_pending] moved underneath us, retry after the
  // in-flight proposal lands
  if (p.get_pg_num() != m->pgid.ps() + 1 ||
      p.get_pg_num_pending() > m->pgid.ps()) {
    dout(10) << __func__
	     << " race with concurrent pg_num[_pending] update, will retry"
	     << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
    return true;
  }

  if (m->ready) {
    // complete the merge: drop pg_num by one and record merge metadata
    p.dec_pg_num(m->pgid,
		 pending_inc.epoch,
		 m->source_version,
		 m->target_version,
		 m->last_epoch_started,
		 m->last_epoch_clean);
    p.last_change = pending_inc.epoch;
  } else {
    // back off the merge attempt!
    p.set_pg_num_pending(p.get_pg_num());
  }

  // force pre-nautilus clients to resend their ops, since they
  // don't understand pg_num_pending changes form a new interval
  p.last_force_op_resend_prenautilus = pending_inc.epoch;

  pending_inc.new_pools[m->pgid.pool()] = p;

  // testing hook: with probability mon_inject_pg_merge_bounce_probability,
  // immediately bounce pg_num back up via a self-injected "osd pool set"
  // command to exercise merge-cancellation paths
  auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
  if (m->ready &&
      prob > 0 &&
      prob > (double)(rand() % 1000)/1000.0) {
    derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
    auto n = new MMonCommand(mon.monmap->get_fsid());
    n->set_connection(m->get_connection());
    n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
	       osdmap.get_pool_name(m->pgid.pool()) +
	       "\", \"var\": \"pg_num_actual\", \"val\": \"" +
	       stringify(m->pgid.ps() + 1) + "\"}" };
    MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
    nop->set_type_service();
    wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
  } else {
    wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  }
  return true;
}
4066
4067
4068 // -------------
4069 // pg_temp changes
4070
// Read-side handler for MOSDPGTemp. Consumes the message (returns true) when
// nothing would change or it must be dropped; returns false to forward to
// prepare_pgtemp() when at least one mapping needs a map update.
bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
{
  auto m = op->get_req<MOSDPGTemp>();
  dout(10) << "preprocess_pgtemp " << *m << dendl;
  // placeholder printed for pgs that have no current pg_temp entry
  mempool::osdmap::vector<int> empty;
  int from = m->get_orig_source().num();
  size_t ignore_cnt = 0;

  // check caps
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  // only accept from an osd that is up at the same address
  if (!osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
    dout(7) << "ignoring pgtemp message from down "
	    << m->get_orig_source() << " " << m->get_orig_source_addrs()
	    << dendl;
    goto ignore;
  }

  // a forced pg_temp bypasses the no-op filtering below entirely
  if (m->forced) {
    return false;
  }

  for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
    dout(20) << " " << p->first
	     << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
	     << " -> " << p->second << dendl;

    // does the pool exist?
    if (!osdmap.have_pg_pool(p->first.pool())) {
      /*
       * 1. If the osdmap does not have the pool, it means the pool has been
       *    removed in-between the osd sending this message and us handling it.
       * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
       *    not exist in the pending either, as the osds would not send a
       *    message about a pool they know nothing about (yet).
       * 3. However, if the pool does exist in the pending, then it must be a
       *    new pool, and not relevant to this message (see 1).
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": pool has been removed" << dendl;
      ignore_cnt++;
      continue;
    }

    int acting_primary = -1;
    osdmap.pg_to_up_acting_osds(
      p->first, nullptr, nullptr, nullptr, &acting_primary);
    if (acting_primary != from) {
      /* If the source isn't the primary based on the current osdmap, we know
       * that the interval changed and that we can discard this message.
       * Indeed, we must do so to avoid 16127 since we can't otherwise determine
       * which of two pg temp mappings on the same pg is more recent.
       */
      dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
	       << ": primary has changed" << dendl;
      ignore_cnt++;
      continue;
    }

    // removal?
    if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
			      osdmap.primary_temp->count(p->first)))
      return false;
    // change?
    // NOTE: we assume that this will clear pg_primary, so consider
    //       an existing pg_primary field to imply a change
    if (p->second.size() &&
	(osdmap.pg_temp->count(p->first) == 0 ||
	 osdmap.pg_temp->get(p->first) != p->second ||
	 osdmap.primary_temp->count(p->first)))
      return false;
  }

  // should we ignore all the pgs?
  if (ignore_cnt == m->pg_temp.size())
    goto ignore;

  // everything requested is already in place: just reply with the map
  dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
  _reply_map(op, m->map_epoch);
  return true;

 ignore:
  mon.no_reply(op);
  return true;
}
4164
4165 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4166 {
4167 epoch_t old_up_thru = osdmap.get_up_thru(from);
4168 auto ut = pending_inc.new_up_thru.find(from);
4169 if (ut != pending_inc.new_up_thru.end()) {
4170 old_up_thru = ut->second;
4171 }
4172 if (up_thru > old_up_thru) {
4173 // set up_thru too, so the osd doesn't have to ask again
4174 pending_inc.new_up_thru[from] = up_thru;
4175 }
4176 }
4177
4178 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4179 {
4180 op->mark_osdmon_event(__func__);
4181 auto m = op->get_req<MOSDPGTemp>();
4182 int from = m->get_orig_source().num();
4183 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4184 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4185 uint64_t pool = p->first.pool();
4186 if (pending_inc.old_pools.count(pool)) {
4187 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4188 << ": pool pending removal" << dendl;
4189 continue;
4190 }
4191 if (!osdmap.have_pg_pool(pool)) {
4192 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4193 << ": pool has been removed" << dendl;
4194 continue;
4195 }
4196 pending_inc.new_pg_temp[p->first] =
4197 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4198
4199 // unconditionally clear pg_primary (until this message can encode
4200 // a change for that, too.. at which point we need to also fix
4201 // preprocess_pg_temp)
4202 if (osdmap.primary_temp->count(p->first) ||
4203 pending_inc.new_primary_temp.count(p->first))
4204 pending_inc.new_primary_temp[p->first] = -1;
4205 }
4206
4207 // set up_thru too, so the osd doesn't have to ask again
4208 update_up_thru(from, m->map_epoch);
4209
4210 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4211 return true;
4212 }
4213
4214
4215 // ---
4216
// Read-side handler for MRemoveSnaps. Consumes the message (returns true)
// when every referenced snap is already removed or belongs to a nonexistent
// pool; returns false to forward to prepare_remove_snaps() when any snap
// still needs to be queued for removal.
bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "preprocess_remove_snaps " << *m << dendl;

  // check privilege, ignore if failed
  MonSession *session = op->get_session();
  mon.no_reply(op);
  if (!session)
    goto ignore;
  if (!session->caps.is_capable(
	cct,
	session->entity_name,
	"osd", "osd pool rmsnap", {}, true, true, false,
	session->get_peer_socket_addr())) {
    dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
	    << session->caps << dendl;
    goto ignore;
  }

  for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
       q != m->snaps.end();
       ++q) {
    if (!osdmap.have_pg_pool(q->first)) {
      dout(10) << " ignoring removed_snaps " << q->second
	       << " on non-existent pool " << q->first << dendl;
      continue;
    }
    const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
    for (vector<snapid_t>::iterator p = q->second.begin();
	 p != q->second.end();
	 ++p) {
      // a snapid past the pool's snap_seq, or one not yet marked removed,
      // still requires a map change -> go to prepare
      if (*p > pi->get_snap_seq() ||
	  !_is_removed_snap(q->first, *p)) {
	return false;
      }
    }
  }

  // nothing to do, but octopus+ senders still expect an explicit ack
  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    mon.send_reply(op, reply.detach());
  }

 ignore:
  return true;
}
4266
// Leader-side commit handler for MRemoveSnaps: queue each not-yet-removed
// snap for removal in the pending incremental (and, pre-octopus, in the
// pool's removed_snaps interval set). Octopus+ senders get an ack once the
// proposal commits. Always returns true.
bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MRemoveSnaps>();
  dout(7) << "prepare_remove_snaps " << *m << dendl;

  for (auto& [pool, snaps] : m->snaps) {
    if (!osdmap.have_pg_pool(pool)) {
      dout(10) << " ignoring removed_snaps " << snaps
	       << " on non-existent pool " << pool << dendl;
      continue;
    }

    pg_pool_t& pi = osdmap.pools[pool];
    for (auto s : snaps) {
      // skip snaps already removed in the committed map, already staged in
      // the pending pool's removed_snaps, or already queued for removal
      if (!_is_removed_snap(pool, s) &&
	  (!pending_inc.new_pools.count(pool) ||
	   !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
	  (!pending_inc.new_removed_snaps.count(pool) ||
	   !pending_inc.new_removed_snaps[pool].contains(s))) {
	pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
	if (osdmap.require_osd_release < ceph_release_t::octopus) {
	  // legacy (pre-octopus) encoding tracks removed snaps in the pool
	  newpi->removed_snaps.insert(s);
	  dout(10) << " pool " << pool << " removed_snaps added " << s
		   << " (now " << newpi->removed_snaps << ")" << dendl;
	}
	newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
	// keep snap_seq monotonic and at least as large as any removed snap
	if (s > newpi->get_snap_seq()) {
	  dout(10) << " pool " << pool << " snap_seq "
		   << newpi->get_snap_seq() << " -> " << s << dendl;
	  newpi->set_snap_seq(s);
	}
	newpi->set_snap_epoch(pending_inc.epoch);
	dout(10) << " added pool " << pool << " snap " << s
		 << " to removed_snaps queue" << dendl;
	pending_inc.new_removed_snaps[pool].insert(s);
      }
    }
  }

  if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
    auto reply = make_message<MRemoveSnaps>();
    reply->snaps = m->snaps;
    wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
  }

  return true;
}
4315
4316 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4317 {
4318 op->mark_osdmon_event(__func__);
4319 auto m = op->get_req<MMonGetPurgedSnaps>();
4320 dout(7) << __func__ << " " << *m << dendl;
4321
4322 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4323
4324 string k = make_purged_snap_epoch_key(m->start);
4325 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
4326 it->upper_bound(k);
4327 unsigned long epoch = m->last;
4328 while (it->valid()) {
4329 if (it->key().find("purged_epoch_") != 0) {
4330 break;
4331 }
4332 string k = it->key();
4333 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4334 if (n != 1) {
4335 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4336 } else if (epoch > m->last) {
4337 break;
4338 } else {
4339 bufferlist bl = it->value();
4340 auto p = bl.cbegin();
4341 auto &v = r[epoch];
4342 try {
4343 ceph::decode(v, p);
4344 } catch (ceph::buffer::error& e) {
4345 derr << __func__ << " unable to parse value for key '" << it->key()
4346 << "': \n";
4347 bl.hexdump(*_dout);
4348 *_dout << dendl;
4349 }
4350 n += 4 + v.size() * 16;
4351 }
4352 if (n > 1048576) {
4353 // impose a semi-arbitrary limit to message size
4354 break;
4355 }
4356 it->next();
4357 }
4358
4359 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4360 reply->purged_snaps.swap(r);
4361 mon.send_reply(op, reply.detach());
4362
4363 return true;
4364 }
4365
4366 // osd beacon
4367 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4368 {
4369 op->mark_osdmon_event(__func__);
4370 // check caps
4371 auto session = op->get_session();
4372 mon.no_reply(op);
4373 if (!session) {
4374 dout(10) << __func__ << " no monitor session!" << dendl;
4375 return true;
4376 }
4377 if (!session->is_capable("osd", MON_CAP_X)) {
4378 derr << __func__ << " received from entity "
4379 << "with insufficient privileges " << session->caps << dendl;
4380 return true;
4381 }
4382 // Always forward the beacon to the leader, even if they are the same as
4383 // the old one. The leader will mark as down osds that haven't sent
4384 // beacon for a few minutes.
4385 return false;
4386 }
4387
// Leader-side handler for osd beacons: refresh liveness bookkeeping and
// per-pg min_last_epoch_clean data. Returns true (propose a map change)
// only when the beacon advances the osd's last_purged_snaps_scrub stamp.
bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  const auto beacon = op->get_req<MOSDBeacon>();
  const auto src = beacon->get_orig_source();
  dout(10) << __func__ << " " << *beacon
	   << " from " << src << dendl;
  int from = src.num();

  if (!src.is_osd() ||
      !osdmap.is_up(from) ||
      !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
    if (src.is_osd() && !osdmap.is_up(from)) {
      // share some new maps with this guy in case it may not be
      // aware of its own deadness...
      send_latest(op, beacon->version+1);
    }
    dout(1) << " ignoring beacon from non-active osd." << from << dendl;
    return false;
  }

  // liveness bookkeeping: when we last heard from this osd, how often it
  // promises to report, and the newest map epoch it claims to have
  last_osd_report[from].first = ceph_clock_now();
  last_osd_report[from].second = beacon->osd_beacon_report_interval;
  osd_epochs[from] = beacon->version;

  // fold this beacon's min_last_epoch_clean into per-pool/pg bookkeeping
  for (const auto& pg : beacon->pgs) {
    if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
      unsigned pg_num = pool->get_pg_num();
      last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
    }
  }

  // persist a newer last_purged_snaps_scrub stamp in the osd's xinfo; this
  // is the only beacon field that requires a paxos proposal
  if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
      beacon->last_purged_snaps_scrub) {
    if (pending_inc.new_xinfo.count(from) == 0) {
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    }
    pending_inc.new_xinfo[from].last_purged_snaps_scrub =
      beacon->last_purged_snaps_scrub;
    return true;
  } else {
    return false;
  }
}
4432
4433 // ---------------
4434 // map helpers
4435
4436 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4437 {
4438 op->mark_osdmon_event(__func__);
4439 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4440 << " start " << start << dendl;
4441 if (start == 0)
4442 send_full(op);
4443 else
4444 send_incremental(op, start);
4445 }
4446
4447
4448 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4449 {
4450 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4451 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4452 r->oldest_map = get_first_committed();
4453 r->newest_map = osdmap.get_epoch();
4454 return r;
4455 }
4456
// Build an MOSDMap carrying incremental maps for epochs [from..to], encoded
// for `features`. When an incremental is unavailable for an epoch, fall back
// to the full map for that epoch; abort if neither exists (we must have every
// map between first_committed and the current epoch). Caller owns the result.
MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
{
  dout(10) << "build_incremental [" << from << ".." << to << "] with features "
	   << std::hex << features << std::dec << dendl;
  MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
  m->oldest_map = get_first_committed();
  m->newest_map = osdmap.get_epoch();

  // walk backwards; the `e > 0` term also guards against epoch_t (unsigned)
  // wrap-around when from == 0
  for (epoch_t e = to; e >= from && e > 0; e--) {
    bufferlist bl;
    int err = get_version(e, features, bl);
    if (err == 0) {
      ceph_assert(bl.length());
      // if (get_version(e, bl) > 0) {
      dout(20) << "build_incremental inc " << e << " "
	       << bl.length() << " bytes" << dendl;
      m->incremental_maps[e] = bl;
    } else {
      ceph_assert(err == -ENOENT);
      ceph_assert(!bl.length());
      // no incremental stored for this epoch; send the full map instead
      get_version_full(e, features, bl);
      if (bl.length() > 0) {
	//else if (get_version("full", e, bl) > 0) {
	dout(20) << "build_incremental full " << e << " "
		 << bl.length() << " bytes" << dendl;
	m->maps[e] = bl;
      } else {
	ceph_abort(); // we should have all maps.
      }
    }
  }
  return m;
}
4490
// Reply to op with a single message carrying the latest full osdmap, encoded
// for the requester's connection features.
void OSDMonitor::send_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
  mon.send_reply(op, build_latest_full(op->get_session()->con_features));
}
4497
4498 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4499 {
4500 op->mark_osdmon_event(__func__);
4501
4502 MonSession *s = op->get_session();
4503 ceph_assert(s);
4504
4505 if (s->proxy_con) {
4506 // oh, we can tell the other mon to do it
4507 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4508 << first << dendl;
4509 MRoute *r = new MRoute(s->proxy_tid, NULL);
4510 r->send_osdmap_first = first;
4511 s->proxy_con->send_message(r);
4512 op->mark_event("reply: send routed send_osdmap_first reply");
4513 } else {
4514 // do it ourselves
4515 send_incremental(first, s, false, op);
4516 }
4517 }
4518
// Send osdmaps in [first..current] to `session`, skipping epochs the session
// already has (session->osd_epoch). If `first` precedes our oldest committed
// map, seed the peer with our oldest full map. When `req` is given, exactly
// one reply message is sent (the requester will re-ask for more); otherwise
// messages are pushed on the session's connection until caught up, or just
// one batch when `onetime` is set.
void OSDMonitor::send_incremental(epoch_t first,
				  MonSession *session,
				  bool onetime,
				  MonOpRequestRef req)
{
  dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
	  << " to " << session->name << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = session->con_features ? session->con_features :
    mon.get_quorum_con_features();

  if (first <= session->osd_epoch) {
    dout(10) << __func__ << " " << session->name << " should already have epoch "
	     << session->osd_epoch << dendl;
    first = session->osd_epoch + 1;
  }

  if (first < get_first_committed()) {
    // requested range has been trimmed; start with our oldest full map
    MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
    m->oldest_map = get_first_committed();
    m->newest_map = osdmap.get_epoch();

    first = get_first_committed();
    bufferlist bl;
    int err = get_version_full(first, features, bl);
    ceph_assert(err == 0);
    ceph_assert(bl.length());
    dout(20) << "send_incremental starting with base full "
	     << first << " " << bl.length() << " bytes" << dendl;
    m->maps[first] = bl;

    if (req) {
      // one reply per request; the requester asks again for the rest
      mon.send_reply(req, m);
      session->osd_epoch = first;
      return;
    } else {
      session->con->send_message(m);
      session->osd_epoch = first;
    }
    first++;
  }

  while (first <= osdmap.get_epoch()) {
    // batch at most osd_map_message_max epochs per message
    epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
				     osdmap.get_epoch());
    MOSDMap *m = build_incremental(first, last, features);

    if (req) {
      // send some maps.  it may not be all of them, but it will get them
      // started.
      mon.send_reply(req, m);
    } else {
      session->con->send_message(m);
      first = last + 1;
    }
    session->osd_epoch = last;
    // in req/onetime mode we send exactly one batch and stop
    if (onetime || req)
      break;
  }
}
4581
// Fetch incremental map `ver` encoded with the quorum's connection features.
int OSDMonitor::get_version(version_t ver, bufferlist& bl)
{
  return get_version(ver, mon.get_quorum_con_features(), bl);
}
4586
// Re-encode an incremental map bufferlist in place for the given feature
// set, never exceeding the incremental's own canonical encode_features. Any
// embedded full map or crush map is re-encoded with the same (restricted)
// feature mask.
void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
{
  OSDMap::Incremental inc;
  auto q = bl.cbegin();
  inc.decode(q);
  // always encode with subset of osdmap's canonical features
  uint64_t f = features & inc.encode_features;
  dout(20) << __func__ << " " << inc.epoch << " with features " << f
	   << dendl;
  bl.clear();
  if (inc.fullmap.length()) {
    // embedded full map?
    OSDMap m;
    m.decode(inc.fullmap);
    inc.fullmap.clear();
    m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
  }
  if (inc.crush.length()) {
    // embedded crush map
    CrushWrapper c;
    auto p = inc.crush.cbegin();
    c.decode(p);
    inc.crush.clear();
    c.encode(inc.crush, f);
  }
  inc.encode(bl, f | CEPH_FEATURE_RESERVED);
}
4614
4615 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4616 {
4617 OSDMap m;
4618 auto q = bl.cbegin();
4619 m.decode(q);
4620 // always encode with subset of osdmap's canonical features
4621 uint64_t f = features & m.get_encoding_features();
4622 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4623 << dendl;
4624 bl.clear();
4625 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4626 }
4627
// Fetch incremental map `ver`, re-encoded for `features` when those differ
// significantly from the quorum's. Results are memoized in inc_osd_cache,
// keyed by (ver, significant feature mask). Returns 0 or a PaxosService
// error (e.g. -ENOENT).
int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version(ver, bl);
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_incremental_map(bl, features);
  }
  inc_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4649
4650 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4651 {
4652 bufferlist inc_bl;
4653 int err = get_version(ver, inc_bl);
4654 ceph_assert(err == 0);
4655 ceph_assert(inc_bl.length());
4656
4657 auto p = inc_bl.cbegin();
4658 inc.decode(p);
4659 dout(10) << __func__ << " "
4660 << " epoch " << inc.epoch
4661 << " inc_crc " << inc.inc_crc
4662 << " full_crc " << inc.full_crc
4663 << " encode_features " << inc.encode_features << dendl;
4664 return 0;
4665 }
4666
// Reconstruct full osdmap `ver` when its stored full encoding has been
// trimmed: start from the closest pinned full map at or below `ver` (or from
// a newer cached reconstruction), replay incrementals up to `ver`, and encode
// the result into bl. Returns -ENOENT when no suitable pinned map exists,
// else 0.
int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
{
  dout(10) << __func__ << " ver " << ver << dendl;

  version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
  if (closest_pinned == 0) {
    return -ENOENT;
  }
  if (closest_pinned > ver) {
    dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
  }
  ceph_assert(closest_pinned <= ver);

  dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;

  // get osdmap incremental maps and apply on top of this one.
  // prefer the newest cached full map between closest_pinned and ver-1 as
  // the starting point, to minimize how many incrementals we replay.
  bufferlist osdm_bl;
  bool has_cached_osdmap = false;
  for (version_t v = ver-1; v >= closest_pinned; --v) {
    if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
			      &osdm_bl)) {
      dout(10) << __func__ << " found map in cache ver " << v << dendl;
      closest_pinned = v;
      has_cached_osdmap = true;
      break;
    }
  }

  if (!has_cached_osdmap) {
    int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
    if (err != 0) {
      derr << __func__ << " closest pinned map ver " << closest_pinned
	   << " not available! error: " << cpp_strerror(err) << dendl;
    }
    ceph_assert(err == 0);
  }

  ceph_assert(osdm_bl.length());

  OSDMap osdm;
  osdm.decode(osdm_bl);

  dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
	   << " e" << osdm.epoch
	   << " crc " << osdm.get_crc()
	   << " -- applying incremental maps." << dendl;

  // replay incrementals; remember the last incremental's encode_features so
  // the final encoding matches how the maps were originally encoded
  uint64_t encode_features = 0;
  for (version_t v = closest_pinned + 1; v <= ver; ++v) {
    dout(20) << __func__ << " applying inc epoch " << v << dendl;

    OSDMap::Incremental inc;
    int err = get_inc(v, inc);
    ceph_assert(err == 0);

    encode_features = inc.encode_features;

    err = osdm.apply_incremental(inc);
    ceph_assert(err == 0);

    // this block performs paranoid checks on map retrieval
    if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
	inc.full_crc != 0) {

      uint64_t f = encode_features;
      if (!f) {
	f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
      }

      // encode osdmap to force calculating crcs
      bufferlist tbl;
      osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
      // decode osdmap to compare crcs with what's expected by incremental
      OSDMap tosdm;
      tosdm.decode(tbl);

      if (tosdm.get_crc() != inc.full_crc) {
	derr << __func__
	     << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
	     << ", expected " << inc.full_crc << ")" << dendl;
	ceph_abort_msg("osdmap crc mismatch");
      }
    }

    // note: we cannot add the recently computed map to the cache, as is,
    // because we have not encoded the map into a bl.
  }

  if (!encode_features) {
    dout(10) << __func__
	     << " last incremental map didn't have features;"
	     << " defaulting to quorum's or all" << dendl;
    encode_features =
      (mon.quorum_con_features ? mon.quorum_con_features : -1);
  }
  osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);

  return 0;
}
4766
// Fetch full map `ver` encoded with the quorum's connection features.
int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
{
  return get_version_full(ver, mon.get_quorum_con_features(), bl);
}
4771
// Fetch full map `ver`, rebuilding it from a pinned map plus incrementals
// when the stored full map has been trimmed, and re-encoding for `features`
// when those differ significantly from the quorum's. Results are memoized in
// full_osd_cache keyed by (ver, significant feature mask).
int OSDMonitor::get_version_full(version_t ver, uint64_t features,
				 bufferlist& bl)
{
  uint64_t significant_features = OSDMap::get_significant_features(features);
  if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
    return 0;
  }
  int ret = PaxosService::get_version_full(ver, bl);
  if (ret == -ENOENT) {
    // build map?
    ret = get_full_from_pinned_map(ver, bl);
  }
  if (ret < 0) {
    return ret;
  }
  // NOTE: this check is imprecise; the OSDMap encoding features may
  // be a subset of the latest mon quorum features, but worst case we
  // reencode once and then cache the (identical) result under both
  // feature masks.
  if (significant_features !=
      OSDMap::get_significant_features(mon.get_quorum_con_features())) {
    reencode_full_map(bl, features);
  }
  full_osd_cache.add_bytes({ver, significant_features}, bl);
  return 0;
}
4798
4799 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4800 {
4801 dout(10) << "blocklist " << av << " until " << until << dendl;
4802 for (auto a : av.v) {
4803 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4804 a.set_type(entity_addr_t::TYPE_ANY);
4805 } else {
4806 a.set_type(entity_addr_t::TYPE_LEGACY);
4807 }
4808 pending_inc.new_blocklist[a] = until;
4809 }
4810 return pending_inc.epoch;
4811 }
4812
4813 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4814 {
4815 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4816 a.set_type(entity_addr_t::TYPE_ANY);
4817 } else {
4818 a.set_type(entity_addr_t::TYPE_LEGACY);
4819 }
4820 dout(10) << "blocklist " << a << " until " << until << dendl;
4821 pending_inc.new_blocklist[a] = until;
4822 return pending_inc.epoch;
4823 }
4824
4825
4826 void OSDMonitor::check_osdmap_subs()
4827 {
4828 dout(10) << __func__ << dendl;
4829 if (!osdmap.get_epoch()) {
4830 return;
4831 }
4832 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4833 if (osdmap_subs == mon.session_map.subs.end()) {
4834 return;
4835 }
4836 auto p = osdmap_subs->second->begin();
4837 while (!p.end()) {
4838 auto sub = *p;
4839 ++p;
4840 check_osdmap_sub(sub);
4841 }
4842 }
4843
// Service one "osdmap" subscription: if the subscriber is behind, send maps
// from sub->next to the current epoch (incrementals when next >= 1, else the
// latest full map), then drop a one-time sub or advance it past the current
// epoch.
void OSDMonitor::check_osdmap_sub(Subscription *sub)
{
  dout(10) << __func__ << " " << sub << " next " << sub->next
	   << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
  if (sub->next <= osdmap.get_epoch()) {
    if (sub->next >= 1)
      send_incremental(sub->next, sub->session, sub->incremental_onetime);
    else
      sub->session->con->send_message(build_latest_full(sub->session->con_features));
    if (sub->onetime)
      mon.session_map.remove_sub(sub);
    else
      sub->next = osdmap.get_epoch() + 1;
  }
}
4859
4860 void OSDMonitor::check_pg_creates_subs()
4861 {
4862 if (!osdmap.get_num_up_osds()) {
4863 return;
4864 }
4865 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4866 mon.with_session_map([this](const MonSessionMap& session_map) {
4867 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4868 if (pg_creates_subs == session_map.subs.end()) {
4869 return;
4870 }
4871 for (auto sub : *pg_creates_subs->second) {
4872 check_pg_creates_sub(sub);
4873 }
4874 });
4875 }
4876
4877 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4878 {
4879 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4880 ceph_assert(sub->type == "osd_pg_creates");
4881 // only send these if the OSD is up. we will check_subs() when they do
4882 // come up so they will get the creates then.
4883 if (sub->session->name.is_osd() &&
4884 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4885 sub->next = send_pg_creates(sub->session->name.num(),
4886 sub->session->con.get(),
4887 sub->next);
4888 }
4889 }
4890
// Enable an application (and optionally one key/value of application
// metadata) on a pool, staging the change in pending_inc. With `force`, an
// existing app_key is overwritten; otherwise insert() leaves any existing
// entry untouched. Requires a plugged paxos and a writeable service
// (asserted), and a luminous+ map.
void OSDMonitor::do_application_enable(int64_t pool_id,
				       const std::string &app_name,
				       const std::string &app_key,
				       const std::string &app_value,
				       bool force)
{
  ceph_assert(paxos.is_plugged() && is_writeable());

  dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
	   << dendl;

  // pool application metadata only exists on luminous+ maps
  ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);

  auto pp = osdmap.get_pg_pool(pool_id);
  ceph_assert(pp != nullptr);

  // start from the pending copy of the pool if it is already being modified
  pg_pool_t p = *pp;
  if (pending_inc.new_pools.count(pool_id)) {
    p = pending_inc.new_pools[pool_id];
  }

  if (app_key.empty()) {
    // enable the application with no metadata; no-op if already enabled
    p.application_metadata.insert({app_name, {}});
  } else {
    if (force) {
      p.application_metadata[app_name][app_key] = app_value;
    } else {
      // non-forced: insert() does not clobber an existing application entry
      p.application_metadata.insert({app_name, {{app_key, app_value}}});
    }
  }
  p.last_change = pending_inc.epoch;
  pending_inc.new_pools[pool_id] = p;
}
4924
4925 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4926 pool_opts_t::key_t opt,
4927 pool_opts_t::value_t val)
4928 {
4929 auto p = pending_inc.new_pools.try_emplace(
4930 pool_id, *osdmap.get_pg_pool(pool_id));
4931 p.first->second.opts.set(opt, val);
4932 }
4933
4934 unsigned OSDMonitor::scan_for_creating_pgs(
4935 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4936 const mempool::osdmap::set<int64_t>& removed_pools,
4937 utime_t modified,
4938 creating_pgs_t* creating_pgs) const
4939 {
4940 unsigned queued = 0;
4941 for (auto& p : pools) {
4942 int64_t poolid = p.first;
4943 if (creating_pgs->created_pools.count(poolid)) {
4944 dout(10) << __func__ << " already created " << poolid << dendl;
4945 continue;
4946 }
4947 const pg_pool_t& pool = p.second;
4948 int ruleno = pool.get_crush_rule();
4949 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4950 continue;
4951
4952 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4953 const auto created = pool.get_last_change();
4954 if (last_scan_epoch && created <= last_scan_epoch) {
4955 dout(10) << __func__ << " no change in pool " << poolid
4956 << " " << pool << dendl;
4957 continue;
4958 }
4959 if (removed_pools.count(poolid)) {
4960 dout(10) << __func__ << " pool is being removed: " << poolid
4961 << " " << pool << dendl;
4962 continue;
4963 }
4964 dout(10) << __func__ << " queueing pool create for " << poolid
4965 << " " << pool << dendl;
4966 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4967 created, modified);
4968 queued++;
4969 }
4970 return queued;
4971 }
4972
// Rebuild creating_pgs_by_osd_epoch from creating_pgs and the current
// OSDMapMapping: bucket every pg still being created under its acting
// primary and the epoch the create message should advertise.
//
// If a pg was previously queued for the same primary we keep the epoch we
// already advertised; if the primary changed (or the pg is new here) we
// restamp it with the current mapping epoch so the create is (re)sent.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  // creating_pgs / creating_pgs_by_osd_epoch are shared with other
  // threads (e.g. send_pg_creates); mutate them only under this lock
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default: the epoch the pg creation was recorded at
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch already advertised
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5020
5021 epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
5022 {
5023 dout(30) << __func__ << " osd." << osd << " next=" << next
5024 << " " << creating_pgs_by_osd_epoch << dendl;
5025 std::lock_guard<std::mutex> l(creating_pgs_lock);
5026 if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
5027 dout(20) << __func__
5028 << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
5029 // the subscribers will be updated when the mapping is completed anyway
5030 return next;
5031 }
5032 auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
5033 if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
5034 return next;
5035 ceph_assert(!creating_pgs_by_epoch->second.empty());
5036
5037 MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
5038 MOSDPGCreate2 *m = nullptr;
5039
5040 bool old = osdmap.require_osd_release < ceph_release_t::nautilus;
5041
5042 epoch_t last = 0;
5043 for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
5044 epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
5045 auto epoch = epoch_pgs->first;
5046 auto& pgs = epoch_pgs->second;
5047 dout(20) << __func__ << " osd." << osd << " from " << next
5048 << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
5049 last = epoch;
5050 for (auto& pg : pgs) {
5051 // Need the create time from the monitor using its clock to set
5052 // last_scrub_stamp upon pg creation.
5053 auto create = creating_pgs.pgs.find(pg.pgid);
5054 ceph_assert(create != creating_pgs.pgs.end());
5055 if (old) {
5056 if (!oldm) {
5057 oldm = new MOSDPGCreate(creating_pgs_epoch);
5058 }
5059 oldm->mkpg.emplace(pg.pgid,
5060 pg_create_t{create->second.create_epoch, pg.pgid, 0});
5061 oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
5062 } else {
5063 if (!m) {
5064 m = new MOSDPGCreate2(creating_pgs_epoch);
5065 }
5066 m->pgs.emplace(pg, make_pair(create->second.create_epoch,
5067 create->second.create_stamp));
5068 if (create->second.history.epoch_created) {
5069 dout(20) << __func__ << " " << pg << " " << create->second.history
5070 << " " << create->second.past_intervals << dendl;
5071 m->pg_extra.emplace(pg, make_pair(create->second.history,
5072 create->second.past_intervals));
5073 }
5074 }
5075 dout(20) << __func__ << " will create " << pg
5076 << " at " << create->second.create_epoch << dendl;
5077 }
5078 }
5079 if (m) {
5080 con->send_message(m);
5081 } else if (oldm) {
5082 con->send_message(oldm);
5083 } else {
5084 dout(20) << __func__ << " osd." << osd << " from " << next
5085 << " has nothing to send" << dendl;
5086 return next;
5087 }
5088
5089 // sub is current through last + 1
5090 return last + 1;
5091 }
5092
5093 // TICK
5094
5095
// Periodic housekeeping for the osdmap service.
//
// Runs on every monitor: refresh the osdmap manifest and rebalance the
// priority cache manager.  On the leader only: time out silent osds,
// process failure reports, auto-mark long-down osds out, expire blocklist
// entries, prune purged snaps, refresh pool status, and propose a new map
// epoch if any of that staged pending changes.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which is the leader's job
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  // mark beacon-silent osds down?
  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance the iterator before the body so the erase(o) at the
      // bottom of the loop cannot invalidate it
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  // laggier osds get a proportionally longer grace period
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  continue;
      }

      // marked out (or no longer down+in): stop tracking this osd
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty()) // also propose if we adjusted pg_temp
    propose_pending();
}
5260
5261 void OSDMonitor::_set_new_cache_sizes()
5262 {
5263 uint64_t cache_size = 0;
5264 int64_t inc_alloc = 0;
5265 int64_t full_alloc = 0;
5266 int64_t kv_alloc = 0;
5267
5268 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5269 cache_size = pcm->get_tuned_mem();
5270 inc_alloc = inc_cache->get_committed_size();
5271 full_alloc = full_cache->get_committed_size();
5272 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5273 }
5274
5275 inc_osd_cache.set_bytes(inc_alloc);
5276 full_osd_cache.set_bytes(full_alloc);
5277
5278 dout(1) << __func__ << " cache_size:" << cache_size
5279 << " inc_alloc: " << inc_alloc
5280 << " full_alloc: " << full_alloc
5281 << " kv_alloc: " << kv_alloc
5282 << dendl;
5283 }
5284
5285 bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
5286 std::map<int, std::pair<utime_t, int>> &last_osd_report)
5287 {
5288 utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
5289 if (now - mon.get_leader_since() < timeo) {
5290 // We haven't been the leader for long enough to consider OSD timeouts
5291 return false;
5292 }
5293
5294 int max_osd = osdmap.get_max_osd();
5295 bool new_down = false;
5296
5297 for (int i=0; i < max_osd; ++i) {
5298 dout(30) << __func__ << ": checking up on osd " << i << dendl;
5299 if (!osdmap.exists(i)) {
5300 last_osd_report.erase(i); // if any
5301 continue;
5302 }
5303 if (!osdmap.is_up(i))
5304 continue;
5305 const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
5306 if (t == last_osd_report.end()) {
5307 // it wasn't in the map; start the timer.
5308 last_osd_report[i].first = now;
5309 last_osd_report[i].second = 0;
5310 } else if (can_mark_down(i)) {
5311 utime_t diff = now - t->second.first;
5312 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5313 // to allow for the osd to miss a beacon.
5314 int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
5315 utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
5316 if (diff > max_timeout) {
5317 mon.clog->info() << "osd." << i << " marked down after no beacon for "
5318 << diff << " seconds";
5319 derr << "no beacon from osd." << i << " since " << t->second.first
5320 << ", " << diff << " seconds ago. marking down" << dendl;
5321 pending_inc.new_state[i] = CEPH_OSD_UP;
5322 new_down = true;
5323 }
5324 }
5325 }
5326 return new_down;
5327 }
5328
5329 static void dump_cpu_list(Formatter *f, const char *name,
5330 const string& strlist)
5331 {
5332 cpu_set_t cpu_set;
5333 size_t cpu_set_size;
5334 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5335 return;
5336 }
5337 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5338 f->open_array_section(name);
5339 for (auto cpu : cpus) {
5340 f->dump_int("cpu", cpu);
5341 }
5342 f->close_section();
5343 }
5344
// Dump monitor-side osdmap state for introspection: the osdmap itself,
// per-osd metadata, clean-epoch bookkeeping, committed version bounds,
// the crush map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      // errors are ignored here; absent metadata just yields an empty entry
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osdmap_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5393
namespace {
  // Keys accepted by "osd pool get"; the enum is used both to parse the
  // requested key and to enumerate "all".
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the members of @c first that are not present in @c second
  // (plain set difference).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (const auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
5428
5429
5430 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5431 {
5432 op->mark_osdmon_event(__func__);
5433 auto m = op->get_req<MMonCommand>();
5434 int r = 0;
5435 bufferlist rdata;
5436 stringstream ss, ds;
5437
5438 cmdmap_t cmdmap;
5439 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5440 string rs = ss.str();
5441 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5442 return true;
5443 }
5444
5445 MonSession *session = op->get_session();
5446 if (!session) {
5447 derr << __func__ << " no session" << dendl;
5448 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5449 return true;
5450 }
5451
5452 string prefix;
5453 cmd_getval(cmdmap, "prefix", prefix);
5454
5455 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5456 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5457
5458 if (prefix == "osd stat") {
5459 if (f) {
5460 f->open_object_section("osdmap");
5461 osdmap.print_summary(f.get(), ds, "", true);
5462 f->close_section();
5463 f->flush(rdata);
5464 } else {
5465 osdmap.print_summary(nullptr, ds, "", true);
5466 rdata.append(ds);
5467 }
5468 }
5469 else if (prefix == "osd dump" ||
5470 prefix == "osd tree" ||
5471 prefix == "osd tree-from" ||
5472 prefix == "osd ls" ||
5473 prefix == "osd getmap" ||
5474 prefix == "osd getcrushmap" ||
5475 prefix == "osd ls-tree" ||
5476 prefix == "osd info") {
5477
5478 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5479 bufferlist osdmap_bl;
5480 int err = get_version_full(epoch, osdmap_bl);
5481 if (err == -ENOENT) {
5482 r = -ENOENT;
5483 ss << "there is no map for epoch " << epoch;
5484 goto reply;
5485 }
5486 ceph_assert(err == 0);
5487 ceph_assert(osdmap_bl.length());
5488
5489 OSDMap *p;
5490 if (epoch == osdmap.get_epoch()) {
5491 p = &osdmap;
5492 } else {
5493 p = new OSDMap;
5494 p->decode(osdmap_bl);
5495 }
5496
5497 auto sg = make_scope_guard([&] {
5498 if (p != &osdmap) {
5499 delete p;
5500 }
5501 });
5502
5503 if (prefix == "osd dump") {
5504 stringstream ds;
5505 if (f) {
5506 f->open_object_section("osdmap");
5507 p->dump(f.get());
5508 f->close_section();
5509 f->flush(ds);
5510 } else {
5511 p->print(ds);
5512 }
5513 rdata.append(ds);
5514 if (!f)
5515 ds << " ";
5516 } else if (prefix == "osd ls") {
5517 if (f) {
5518 f->open_array_section("osds");
5519 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5520 if (osdmap.exists(i)) {
5521 f->dump_int("osd", i);
5522 }
5523 }
5524 f->close_section();
5525 f->flush(ds);
5526 } else {
5527 bool first = true;
5528 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5529 if (osdmap.exists(i)) {
5530 if (!first)
5531 ds << "\n";
5532 first = false;
5533 ds << i;
5534 }
5535 }
5536 }
5537 rdata.append(ds);
5538 } else if (prefix == "osd info") {
5539 int64_t osd_id;
5540 bool do_single_osd = true;
5541 if (!cmd_getval(cmdmap, "id", osd_id)) {
5542 do_single_osd = false;
5543 }
5544
5545 if (do_single_osd && !osdmap.exists(osd_id)) {
5546 ss << "osd." << osd_id << " does not exist";
5547 r = -EINVAL;
5548 goto reply;
5549 }
5550
5551 if (f) {
5552 if (do_single_osd) {
5553 osdmap.dump_osd(osd_id, f.get());
5554 } else {
5555 osdmap.dump_osds(f.get());
5556 }
5557 f->flush(ds);
5558 } else {
5559 if (do_single_osd) {
5560 osdmap.print_osd(osd_id, ds);
5561 } else {
5562 osdmap.print_osds(ds);
5563 }
5564 }
5565 rdata.append(ds);
5566 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5567 string bucket;
5568 if (prefix == "osd tree-from") {
5569 cmd_getval(cmdmap, "bucket", bucket);
5570 if (!osdmap.crush->name_exists(bucket)) {
5571 ss << "bucket '" << bucket << "' does not exist";
5572 r = -ENOENT;
5573 goto reply;
5574 }
5575 int id = osdmap.crush->get_item_id(bucket);
5576 if (id >= 0) {
5577 ss << "\"" << bucket << "\" is not a bucket";
5578 r = -EINVAL;
5579 goto reply;
5580 }
5581 }
5582
5583 vector<string> states;
5584 cmd_getval(cmdmap, "states", states);
5585 unsigned filter = 0;
5586 for (auto& s : states) {
5587 if (s == "up") {
5588 filter |= OSDMap::DUMP_UP;
5589 } else if (s == "down") {
5590 filter |= OSDMap::DUMP_DOWN;
5591 } else if (s == "in") {
5592 filter |= OSDMap::DUMP_IN;
5593 } else if (s == "out") {
5594 filter |= OSDMap::DUMP_OUT;
5595 } else if (s == "destroyed") {
5596 filter |= OSDMap::DUMP_DESTROYED;
5597 } else {
5598 ss << "unrecognized state '" << s << "'";
5599 r = -EINVAL;
5600 goto reply;
5601 }
5602 }
5603 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5604 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5605 ss << "cannot specify both 'in' and 'out'";
5606 r = -EINVAL;
5607 goto reply;
5608 }
5609 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5610 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5611 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5612 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5613 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5614 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5615 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5616 r = -EINVAL;
5617 goto reply;
5618 }
5619 if (f) {
5620 f->open_object_section("tree");
5621 p->print_tree(f.get(), NULL, filter, bucket);
5622 f->close_section();
5623 f->flush(ds);
5624 } else {
5625 p->print_tree(NULL, &ds, filter, bucket);
5626 }
5627 rdata.append(ds);
5628 } else if (prefix == "osd getmap") {
5629 rdata.append(osdmap_bl);
5630 ss << "got osdmap epoch " << p->get_epoch();
5631 } else if (prefix == "osd getcrushmap") {
5632 p->crush->encode(rdata, mon.get_quorum_con_features());
5633 ss << p->get_crush_version();
5634 } else if (prefix == "osd ls-tree") {
5635 string bucket_name;
5636 cmd_getval(cmdmap, "name", bucket_name);
5637 set<int> osds;
5638 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5639 if (r == -ENOENT) {
5640 ss << "\"" << bucket_name << "\" does not exist";
5641 goto reply;
5642 } else if (r < 0) {
5643 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5644 goto reply;
5645 }
5646
5647 if (f) {
5648 f->open_array_section("osds");
5649 for (auto &i : osds) {
5650 if (osdmap.exists(i)) {
5651 f->dump_int("osd", i);
5652 }
5653 }
5654 f->close_section();
5655 f->flush(ds);
5656 } else {
5657 bool first = true;
5658 for (auto &i : osds) {
5659 if (osdmap.exists(i)) {
5660 if (!first)
5661 ds << "\n";
5662 first = false;
5663 ds << i;
5664 }
5665 }
5666 }
5667
5668 rdata.append(ds);
5669 }
5670 } else if (prefix == "osd getmaxosd") {
5671 if (f) {
5672 f->open_object_section("getmaxosd");
5673 f->dump_unsigned("epoch", osdmap.get_epoch());
5674 f->dump_int("max_osd", osdmap.get_max_osd());
5675 f->close_section();
5676 f->flush(rdata);
5677 } else {
5678 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5679 rdata.append(ds);
5680 }
5681 } else if (prefix == "osd utilization") {
5682 string out;
5683 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5684 if (f)
5685 f->flush(rdata);
5686 else
5687 rdata.append(out);
5688 r = 0;
5689 goto reply;
5690 } else if (prefix == "osd find") {
5691 int64_t osd;
5692 if (!cmd_getval(cmdmap, "id", osd)) {
5693 ss << "unable to parse osd id value '"
5694 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5695 r = -EINVAL;
5696 goto reply;
5697 }
5698 if (!osdmap.exists(osd)) {
5699 ss << "osd." << osd << " does not exist";
5700 r = -ENOENT;
5701 goto reply;
5702 }
5703 string format;
5704 cmd_getval(cmdmap, "format", format);
5705 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5706 f->open_object_section("osd_location");
5707 f->dump_int("osd", osd);
5708 f->dump_object("addrs", osdmap.get_addrs(osd));
5709 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5710
5711 // try to identify host, pod/container name, etc.
5712 map<string,string> m;
5713 load_metadata(osd, m, nullptr);
5714 if (auto p = m.find("hostname"); p != m.end()) {
5715 f->dump_string("host", p->second);
5716 }
5717 for (auto& k : {
5718 "pod_name", "pod_namespace", // set by rook
5719 "container_name" // set by cephadm, ceph-ansible
5720 }) {
5721 if (auto p = m.find(k); p != m.end()) {
5722 f->dump_string(k, p->second);
5723 }
5724 }
5725
5726 // crush is helpful too
5727 f->open_object_section("crush_location");
5728 map<string,string> loc = osdmap.crush->get_full_location(osd);
5729 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5730 f->dump_string(p->first.c_str(), p->second);
5731 f->close_section();
5732 f->close_section();
5733 f->flush(rdata);
5734 } else if (prefix == "osd metadata") {
5735 int64_t osd = -1;
5736 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5737 !cmd_getval(cmdmap, "id", osd)) {
5738 ss << "unable to parse osd id value '"
5739 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5740 r = -EINVAL;
5741 goto reply;
5742 }
5743 if (osd >= 0 && !osdmap.exists(osd)) {
5744 ss << "osd." << osd << " does not exist";
5745 r = -ENOENT;
5746 goto reply;
5747 }
5748 string format;
5749 cmd_getval(cmdmap, "format", format);
5750 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5751 if (osd >= 0) {
5752 f->open_object_section("osd_metadata");
5753 f->dump_unsigned("id", osd);
5754 r = dump_osd_metadata(osd, f.get(), &ss);
5755 if (r < 0)
5756 goto reply;
5757 f->close_section();
5758 } else {
5759 r = 0;
5760 f->open_array_section("osd_metadata");
5761 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5762 if (osdmap.exists(i)) {
5763 f->open_object_section("osd");
5764 f->dump_unsigned("id", i);
5765 r = dump_osd_metadata(i, f.get(), NULL);
5766 if (r == -EINVAL || r == -ENOENT) {
5767 // Drop error, continue to get other daemons' metadata
5768 dout(4) << "No metadata for osd." << i << dendl;
5769 r = 0;
5770 } else if (r < 0) {
5771 // Unexpected error
5772 goto reply;
5773 }
5774 f->close_section();
5775 }
5776 }
5777 f->close_section();
5778 }
5779 f->flush(rdata);
5780 } else if (prefix == "osd versions") {
5781 if (!f)
5782 f.reset(Formatter::create("json-pretty"));
5783 count_metadata("ceph_version", f.get());
5784 f->flush(rdata);
5785 r = 0;
5786 } else if (prefix == "osd count-metadata") {
5787 if (!f)
5788 f.reset(Formatter::create("json-pretty"));
5789 string field;
5790 cmd_getval(cmdmap, "property", field);
5791 count_metadata(field, f.get());
5792 f->flush(rdata);
5793 r = 0;
5794 } else if (prefix == "osd numa-status") {
5795 TextTable tbl;
5796 if (f) {
5797 f->open_array_section("osds");
5798 } else {
5799 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5800 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5801 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5802 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5803 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5804 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5805 }
5806 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5807 if (osdmap.exists(i)) {
5808 map<string,string> m;
5809 ostringstream err;
5810 if (load_metadata(i, m, &err) < 0) {
5811 continue;
5812 }
5813 string host;
5814 auto p = m.find("hostname");
5815 if (p != m.end()) {
5816 host = p->second;
5817 }
5818 if (f) {
5819 f->open_object_section("osd");
5820 f->dump_int("osd", i);
5821 f->dump_string("host", host);
5822 for (auto n : { "network_numa_node", "objectstore_numa_node",
5823 "numa_node" }) {
5824 p = m.find(n);
5825 if (p != m.end()) {
5826 f->dump_int(n, atoi(p->second.c_str()));
5827 }
5828 }
5829 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5830 p = m.find(n);
5831 if (p != m.end()) {
5832 list<string> ls = get_str_list(p->second, ",");
5833 f->open_array_section(n);
5834 for (auto node : ls) {
5835 f->dump_int("node", atoi(node.c_str()));
5836 }
5837 f->close_section();
5838 }
5839 }
5840 for (auto n : { "numa_node_cpus" }) {
5841 p = m.find(n);
5842 if (p != m.end()) {
5843 dump_cpu_list(f.get(), n, p->second);
5844 }
5845 }
5846 f->close_section();
5847 } else {
5848 tbl << i;
5849 tbl << host;
5850 p = m.find("network_numa_nodes");
5851 if (p != m.end()) {
5852 tbl << p->second;
5853 } else {
5854 tbl << "-";
5855 }
5856 p = m.find("objectstore_numa_nodes");
5857 if (p != m.end()) {
5858 tbl << p->second;
5859 } else {
5860 tbl << "-";
5861 }
5862 p = m.find("numa_node");
5863 auto q = m.find("numa_node_cpus");
5864 if (p != m.end() && q != m.end()) {
5865 tbl << p->second;
5866 tbl << q->second;
5867 } else {
5868 tbl << "-";
5869 tbl << "-";
5870 }
5871 tbl << TextTable::endrow;
5872 }
5873 }
5874 }
5875 if (f) {
5876 f->close_section();
5877 f->flush(rdata);
5878 } else {
5879 rdata.append(stringify(tbl));
5880 }
5881 } else if (prefix == "osd map") {
5882 string poolstr, objstr, namespacestr;
5883 cmd_getval(cmdmap, "pool", poolstr);
5884 cmd_getval(cmdmap, "object", objstr);
5885 cmd_getval(cmdmap, "nspace", namespacestr);
5886
5887 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5888 if (pool < 0) {
5889 ss << "pool " << poolstr << " does not exist";
5890 r = -ENOENT;
5891 goto reply;
5892 }
5893 object_locator_t oloc(pool, namespacestr);
5894 object_t oid(objstr);
5895 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5896 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5897 vector<int> up, acting;
5898 int up_p, acting_p;
5899 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5900
5901 string fullobjname;
5902 if (!namespacestr.empty())
5903 fullobjname = namespacestr + string("/") + oid.name;
5904 else
5905 fullobjname = oid.name;
5906 if (f) {
5907 f->open_object_section("osd_map");
5908 f->dump_unsigned("epoch", osdmap.get_epoch());
5909 f->dump_string("pool", poolstr);
5910 f->dump_int("pool_id", pool);
5911 f->dump_stream("objname") << fullobjname;
5912 f->dump_stream("raw_pgid") << pgid;
5913 f->dump_stream("pgid") << mpgid;
5914 f->open_array_section("up");
5915 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5916 f->dump_int("osd", *p);
5917 f->close_section();
5918 f->dump_int("up_primary", up_p);
5919 f->open_array_section("acting");
5920 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5921 f->dump_int("osd", *p);
5922 f->close_section();
5923 f->dump_int("acting_primary", acting_p);
5924 f->close_section(); // osd_map
5925 f->flush(rdata);
5926 } else {
5927 ds << "osdmap e" << osdmap.get_epoch()
5928 << " pool '" << poolstr << "' (" << pool << ")"
5929 << " object '" << fullobjname << "' ->"
5930 << " pg " << pgid << " (" << mpgid << ")"
5931 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5932 << pg_vector_string(acting) << ", p" << acting_p << ")";
5933 rdata.append(ds);
5934 }
5935
5936 } else if (prefix == "pg map") {
5937 pg_t pgid;
5938 string pgidstr;
5939 cmd_getval(cmdmap, "pgid", pgidstr);
5940 if (!pgid.parse(pgidstr.c_str())) {
5941 ss << "invalid pgid '" << pgidstr << "'";
5942 r = -EINVAL;
5943 goto reply;
5944 }
5945 vector<int> up, acting;
5946 if (!osdmap.have_pg_pool(pgid.pool())) {
5947 ss << "pg '" << pgidstr << "' does not exist";
5948 r = -ENOENT;
5949 goto reply;
5950 }
5951 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5952 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5953 if (f) {
5954 f->open_object_section("pg_map");
5955 f->dump_unsigned("epoch", osdmap.get_epoch());
5956 f->dump_stream("raw_pgid") << pgid;
5957 f->dump_stream("pgid") << mpgid;
5958 f->open_array_section("up");
5959 for (auto osd : up) {
5960 f->dump_int("up_osd", osd);
5961 }
5962 f->close_section();
5963 f->open_array_section("acting");
5964 for (auto osd : acting) {
5965 f->dump_int("acting_osd", osd);
5966 }
5967 f->close_section();
5968 f->close_section();
5969 f->flush(rdata);
5970 } else {
5971 ds << "osdmap e" << osdmap.get_epoch()
5972 << " pg " << pgid << " (" << mpgid << ")"
5973 << " -> up " << up << " acting " << acting;
5974 rdata.append(ds);
5975 }
5976 goto reply;
5977
5978 } else if (prefix == "osd lspools") {
5979 if (f)
5980 f->open_array_section("pools");
5981 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
5982 p != osdmap.pools.end();
5983 ++p) {
5984 if (f) {
5985 f->open_object_section("pool");
5986 f->dump_int("poolnum", p->first);
5987 f->dump_string("poolname", osdmap.pool_name[p->first]);
5988 f->close_section();
5989 } else {
5990 ds << p->first << ' ' << osdmap.pool_name[p->first];
5991 if (next(p) != osdmap.pools.end()) {
5992 ds << '\n';
5993 }
5994 }
5995 }
5996 if (f) {
5997 f->close_section();
5998 f->flush(ds);
5999 }
6000 rdata.append(ds);
6001 } else if (prefix == "osd blocklist ls" ||
6002 prefix == "osd blacklist ls") {
6003 if (f)
6004 f->open_array_section("blocklist");
6005
6006 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6007 p != osdmap.blocklist.end();
6008 ++p) {
6009 if (f) {
6010 f->open_object_section("entry");
6011 f->dump_string("addr", p->first.get_legacy_str());
6012 f->dump_stream("until") << p->second;
6013 f->close_section();
6014 } else {
6015 stringstream ss;
6016 string s;
6017 ss << p->first << " " << p->second;
6018 getline(ss, s);
6019 s += "\n";
6020 rdata.append(s);
6021 }
6022 }
6023 if (f) {
6024 f->close_section();
6025 f->flush(rdata);
6026 }
6027 ss << "listed " << osdmap.blocklist.size() << " entries";
6028
6029 } else if (prefix == "osd pool ls") {
6030 string detail;
6031 cmd_getval(cmdmap, "detail", detail);
6032 if (!f && detail == "detail") {
6033 ostringstream ss;
6034 osdmap.print_pools(ss);
6035 rdata.append(ss.str());
6036 } else {
6037 if (f)
6038 f->open_array_section("pools");
6039 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
6040 it != osdmap.get_pools().end();
6041 ++it) {
6042 if (f) {
6043 if (detail == "detail") {
6044 f->open_object_section("pool");
6045 f->dump_int("pool_id", it->first);
6046 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6047 it->second.dump(f.get());
6048 f->close_section();
6049 } else {
6050 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6051 }
6052 } else {
6053 rdata.append(osdmap.get_pool_name(it->first) + "\n");
6054 }
6055 }
6056 if (f) {
6057 f->close_section();
6058 f->flush(rdata);
6059 }
6060 }
6061
6062 } else if (prefix == "osd crush get-tunable") {
6063 string tunable;
6064 cmd_getval(cmdmap, "tunable", tunable);
6065 ostringstream rss;
6066 if (f)
6067 f->open_object_section("tunable");
6068 if (tunable == "straw_calc_version") {
6069 if (f)
6070 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6071 else
6072 rss << osdmap.crush->get_straw_calc_version() << "\n";
6073 } else {
6074 r = -EINVAL;
6075 goto reply;
6076 }
6077 if (f) {
6078 f->close_section();
6079 f->flush(rdata);
6080 } else {
6081 rdata.append(rss.str());
6082 }
6083 r = 0;
6084
6085 } else if (prefix == "osd pool get") {
6086 string poolstr;
6087 cmd_getval(cmdmap, "pool", poolstr);
6088 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6089 if (pool < 0) {
6090 ss << "unrecognized pool '" << poolstr << "'";
6091 r = -ENOENT;
6092 goto reply;
6093 }
6094
6095 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6096 string var;
6097 cmd_getval(cmdmap, "var", var);
6098
6099 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6100 const choices_map_t ALL_CHOICES = {
6101 {"size", SIZE},
6102 {"min_size", MIN_SIZE},
6103 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6104 {"crush_rule", CRUSH_RULE},
6105 {"hashpspool", HASHPSPOOL},
6106 {"eio", POOL_EIO},
6107 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6108 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6109 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6110 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6111 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6112 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6113 {"use_gmt_hitset", USE_GMT_HITSET},
6114 {"target_max_objects", TARGET_MAX_OBJECTS},
6115 {"target_max_bytes", TARGET_MAX_BYTES},
6116 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6117 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6118 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6119 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6120 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6121 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6122 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6123 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6124 {"fast_read", FAST_READ},
6125 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6126 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6127 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6128 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6129 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6130 {"recovery_priority", RECOVERY_PRIORITY},
6131 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6132 {"scrub_priority", SCRUB_PRIORITY},
6133 {"compression_mode", COMPRESSION_MODE},
6134 {"compression_algorithm", COMPRESSION_ALGORITHM},
6135 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6136 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6137 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6138 {"csum_type", CSUM_TYPE},
6139 {"csum_max_block", CSUM_MAX_BLOCK},
6140 {"csum_min_block", CSUM_MIN_BLOCK},
6141 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6142 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6143 {"pg_num_min", PG_NUM_MIN},
6144 {"pg_num_max", PG_NUM_MAX},
6145 {"target_size_bytes", TARGET_SIZE_BYTES},
6146 {"target_size_ratio", TARGET_SIZE_RATIO},
6147 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6148 {"dedup_tier", DEDUP_TIER},
6149 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6150 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6151 {"bulk", BULK}
6152 };
6153
6154 typedef std::set<osd_pool_get_choices> choices_set_t;
6155
6156 const choices_set_t ONLY_TIER_CHOICES = {
6157 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6158 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6159 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6160 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6161 MIN_READ_RECENCY_FOR_PROMOTE,
6162 MIN_WRITE_RECENCY_FOR_PROMOTE,
6163 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6164 };
6165 const choices_set_t ONLY_ERASURE_CHOICES = {
6166 EC_OVERWRITES, ERASURE_CODE_PROFILE
6167 };
6168
6169 choices_set_t selected_choices;
6170 if (var == "all") {
6171 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6172 it != ALL_CHOICES.end(); ++it) {
6173 selected_choices.insert(it->second);
6174 }
6175
6176 if(!p->is_tier()) {
6177 selected_choices = subtract_second_from_first(selected_choices,
6178 ONLY_TIER_CHOICES);
6179 }
6180
6181 if(!p->is_erasure()) {
6182 selected_choices = subtract_second_from_first(selected_choices,
6183 ONLY_ERASURE_CHOICES);
6184 }
6185 } else /* var != "all" */ {
6186 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6187 if (found == ALL_CHOICES.end()) {
6188 ss << "pool '" << poolstr
6189 << "': invalid variable: '" << var << "'";
6190 r = -EINVAL;
6191 goto reply;
6192 }
6193
6194 osd_pool_get_choices selected = found->second;
6195
6196 if (!p->is_tier() &&
6197 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6198 ss << "pool '" << poolstr
6199 << "' is not a tier pool: variable not applicable";
6200 r = -EACCES;
6201 goto reply;
6202 }
6203
6204 if (!p->is_erasure() &&
6205 ONLY_ERASURE_CHOICES.find(selected)
6206 != ONLY_ERASURE_CHOICES.end()) {
6207 ss << "pool '" << poolstr
6208 << "' is not a erasure pool: variable not applicable";
6209 r = -EACCES;
6210 goto reply;
6211 }
6212
6213 if (pool_opts_t::is_opt_name(var) &&
6214 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6215 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6216 r = -ENOENT;
6217 goto reply;
6218 }
6219
6220 selected_choices.insert(selected);
6221 }
6222
6223 if (f) {
6224 f->open_object_section("pool");
6225 f->dump_string("pool", poolstr);
6226 f->dump_int("pool_id", pool);
6227 for(choices_set_t::const_iterator it = selected_choices.begin();
6228 it != selected_choices.end(); ++it) {
6229 choices_map_t::const_iterator i;
6230 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6231 if (i->second == *it) {
6232 break;
6233 }
6234 }
6235 ceph_assert(i != ALL_CHOICES.end());
6236 switch(*it) {
6237 case PG_NUM:
6238 f->dump_int("pg_num", p->get_pg_num());
6239 break;
6240 case PGP_NUM:
6241 f->dump_int("pgp_num", p->get_pgp_num());
6242 break;
6243 case SIZE:
6244 f->dump_int("size", p->get_size());
6245 break;
6246 case MIN_SIZE:
6247 f->dump_int("min_size", p->get_min_size());
6248 break;
6249 case CRUSH_RULE:
6250 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6251 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6252 p->get_crush_rule()));
6253 } else {
6254 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6255 }
6256 break;
6257 case EC_OVERWRITES:
6258 f->dump_bool("allow_ec_overwrites",
6259 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6260 break;
6261 case PG_AUTOSCALE_MODE:
6262 f->dump_string("pg_autoscale_mode",
6263 pg_pool_t::get_pg_autoscale_mode_name(
6264 p->pg_autoscale_mode));
6265 break;
6266 case HASHPSPOOL:
6267 case POOL_EIO:
6268 case NODELETE:
6269 case BULK:
6270 case NOPGCHANGE:
6271 case NOSIZECHANGE:
6272 case WRITE_FADVISE_DONTNEED:
6273 case NOSCRUB:
6274 case NODEEP_SCRUB:
6275 f->dump_bool(i->first.c_str(),
6276 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6277 break;
6278 case HIT_SET_PERIOD:
6279 f->dump_int("hit_set_period", p->hit_set_period);
6280 break;
6281 case HIT_SET_COUNT:
6282 f->dump_int("hit_set_count", p->hit_set_count);
6283 break;
6284 case HIT_SET_TYPE:
6285 f->dump_string("hit_set_type",
6286 HitSet::get_type_name(p->hit_set_params.get_type()));
6287 break;
6288 case HIT_SET_FPP:
6289 {
6290 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6291 BloomHitSet::Params *bloomp =
6292 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6293 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6294 } else if(var != "all") {
6295 f->close_section();
6296 ss << "hit set is not of type Bloom; " <<
6297 "invalid to get a false positive rate!";
6298 r = -EINVAL;
6299 goto reply;
6300 }
6301 }
6302 break;
6303 case USE_GMT_HITSET:
6304 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6305 break;
6306 case TARGET_MAX_OBJECTS:
6307 f->dump_unsigned("target_max_objects", p->target_max_objects);
6308 break;
6309 case TARGET_MAX_BYTES:
6310 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6311 break;
6312 case CACHE_TARGET_DIRTY_RATIO:
6313 f->dump_unsigned("cache_target_dirty_ratio_micro",
6314 p->cache_target_dirty_ratio_micro);
6315 f->dump_float("cache_target_dirty_ratio",
6316 ((float)p->cache_target_dirty_ratio_micro/1000000));
6317 break;
6318 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6319 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6320 p->cache_target_dirty_high_ratio_micro);
6321 f->dump_float("cache_target_dirty_high_ratio",
6322 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6323 break;
6324 case CACHE_TARGET_FULL_RATIO:
6325 f->dump_unsigned("cache_target_full_ratio_micro",
6326 p->cache_target_full_ratio_micro);
6327 f->dump_float("cache_target_full_ratio",
6328 ((float)p->cache_target_full_ratio_micro/1000000));
6329 break;
6330 case CACHE_MIN_FLUSH_AGE:
6331 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6332 break;
6333 case CACHE_MIN_EVICT_AGE:
6334 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6335 break;
6336 case ERASURE_CODE_PROFILE:
6337 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6338 break;
6339 case MIN_READ_RECENCY_FOR_PROMOTE:
6340 f->dump_int("min_read_recency_for_promote",
6341 p->min_read_recency_for_promote);
6342 break;
6343 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6344 f->dump_int("min_write_recency_for_promote",
6345 p->min_write_recency_for_promote);
6346 break;
6347 case FAST_READ:
6348 f->dump_int("fast_read", p->fast_read);
6349 break;
6350 case HIT_SET_GRADE_DECAY_RATE:
6351 f->dump_int("hit_set_grade_decay_rate",
6352 p->hit_set_grade_decay_rate);
6353 break;
6354 case HIT_SET_SEARCH_LAST_N:
6355 f->dump_int("hit_set_search_last_n",
6356 p->hit_set_search_last_n);
6357 break;
6358 case SCRUB_MIN_INTERVAL:
6359 case SCRUB_MAX_INTERVAL:
6360 case DEEP_SCRUB_INTERVAL:
6361 case RECOVERY_PRIORITY:
6362 case RECOVERY_OP_PRIORITY:
6363 case SCRUB_PRIORITY:
6364 case COMPRESSION_MODE:
6365 case COMPRESSION_ALGORITHM:
6366 case COMPRESSION_REQUIRED_RATIO:
6367 case COMPRESSION_MAX_BLOB_SIZE:
6368 case COMPRESSION_MIN_BLOB_SIZE:
6369 case CSUM_TYPE:
6370 case CSUM_MAX_BLOCK:
6371 case CSUM_MIN_BLOCK:
6372 case FINGERPRINT_ALGORITHM:
6373 case PG_NUM_MIN:
6374 case PG_NUM_MAX:
6375 case TARGET_SIZE_BYTES:
6376 case TARGET_SIZE_RATIO:
6377 case PG_AUTOSCALE_BIAS:
6378 case DEDUP_TIER:
6379 case DEDUP_CHUNK_ALGORITHM:
6380 case DEDUP_CDC_CHUNK_SIZE:
6381 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6382 if (p->opts.is_set(key)) {
6383 if(*it == CSUM_TYPE) {
6384 int64_t val;
6385 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6386 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6387 } else {
6388 p->opts.dump(i->first, f.get());
6389 }
6390 }
6391 break;
6392 }
6393 }
6394 f->close_section();
6395 f->flush(rdata);
6396 } else /* !f */ {
6397 for(choices_set_t::const_iterator it = selected_choices.begin();
6398 it != selected_choices.end(); ++it) {
6399 choices_map_t::const_iterator i;
6400 switch(*it) {
6401 case PG_NUM:
6402 ss << "pg_num: " << p->get_pg_num() << "\n";
6403 break;
6404 case PGP_NUM:
6405 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6406 break;
6407 case SIZE:
6408 ss << "size: " << p->get_size() << "\n";
6409 break;
6410 case MIN_SIZE:
6411 ss << "min_size: " << p->get_min_size() << "\n";
6412 break;
6413 case CRUSH_RULE:
6414 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6415 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6416 p->get_crush_rule()) << "\n";
6417 } else {
6418 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6419 }
6420 break;
6421 case PG_AUTOSCALE_MODE:
6422 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6423 p->pg_autoscale_mode) <<"\n";
6424 break;
6425 case HIT_SET_PERIOD:
6426 ss << "hit_set_period: " << p->hit_set_period << "\n";
6427 break;
6428 case HIT_SET_COUNT:
6429 ss << "hit_set_count: " << p->hit_set_count << "\n";
6430 break;
6431 case HIT_SET_TYPE:
6432 ss << "hit_set_type: " <<
6433 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6434 break;
6435 case HIT_SET_FPP:
6436 {
6437 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6438 BloomHitSet::Params *bloomp =
6439 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6440 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6441 } else if(var != "all") {
6442 ss << "hit set is not of type Bloom; " <<
6443 "invalid to get a false positive rate!";
6444 r = -EINVAL;
6445 goto reply;
6446 }
6447 }
6448 break;
6449 case USE_GMT_HITSET:
6450 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6451 break;
6452 case TARGET_MAX_OBJECTS:
6453 ss << "target_max_objects: " << p->target_max_objects << "\n";
6454 break;
6455 case TARGET_MAX_BYTES:
6456 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6457 break;
6458 case CACHE_TARGET_DIRTY_RATIO:
6459 ss << "cache_target_dirty_ratio: "
6460 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6461 break;
6462 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6463 ss << "cache_target_dirty_high_ratio: "
6464 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6465 break;
6466 case CACHE_TARGET_FULL_RATIO:
6467 ss << "cache_target_full_ratio: "
6468 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6469 break;
6470 case CACHE_MIN_FLUSH_AGE:
6471 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6472 break;
6473 case CACHE_MIN_EVICT_AGE:
6474 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6475 break;
6476 case ERASURE_CODE_PROFILE:
6477 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6478 break;
6479 case MIN_READ_RECENCY_FOR_PROMOTE:
6480 ss << "min_read_recency_for_promote: " <<
6481 p->min_read_recency_for_promote << "\n";
6482 break;
6483 case HIT_SET_GRADE_DECAY_RATE:
6484 ss << "hit_set_grade_decay_rate: " <<
6485 p->hit_set_grade_decay_rate << "\n";
6486 break;
6487 case HIT_SET_SEARCH_LAST_N:
6488 ss << "hit_set_search_last_n: " <<
6489 p->hit_set_search_last_n << "\n";
6490 break;
6491 case EC_OVERWRITES:
6492 ss << "allow_ec_overwrites: " <<
6493 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6494 "\n";
6495 break;
6496 case HASHPSPOOL:
6497 case POOL_EIO:
6498 case NODELETE:
6499 case BULK:
6500 case NOPGCHANGE:
6501 case NOSIZECHANGE:
6502 case WRITE_FADVISE_DONTNEED:
6503 case NOSCRUB:
6504 case NODEEP_SCRUB:
6505 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6506 if (i->second == *it)
6507 break;
6508 }
6509 ceph_assert(i != ALL_CHOICES.end());
6510 ss << i->first << ": " <<
6511 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6512 "true" : "false") << "\n";
6513 break;
6514 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6515 ss << "min_write_recency_for_promote: " <<
6516 p->min_write_recency_for_promote << "\n";
6517 break;
6518 case FAST_READ:
6519 ss << "fast_read: " << p->fast_read << "\n";
6520 break;
6521 case SCRUB_MIN_INTERVAL:
6522 case SCRUB_MAX_INTERVAL:
6523 case DEEP_SCRUB_INTERVAL:
6524 case RECOVERY_PRIORITY:
6525 case RECOVERY_OP_PRIORITY:
6526 case SCRUB_PRIORITY:
6527 case COMPRESSION_MODE:
6528 case COMPRESSION_ALGORITHM:
6529 case COMPRESSION_REQUIRED_RATIO:
6530 case COMPRESSION_MAX_BLOB_SIZE:
6531 case COMPRESSION_MIN_BLOB_SIZE:
6532 case CSUM_TYPE:
6533 case CSUM_MAX_BLOCK:
6534 case CSUM_MIN_BLOCK:
6535 case FINGERPRINT_ALGORITHM:
6536 case PG_NUM_MIN:
6537 case PG_NUM_MAX:
6538 case TARGET_SIZE_BYTES:
6539 case TARGET_SIZE_RATIO:
6540 case PG_AUTOSCALE_BIAS:
6541 case DEDUP_TIER:
6542 case DEDUP_CHUNK_ALGORITHM:
6543 case DEDUP_CDC_CHUNK_SIZE:
6544 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6545 if (i->second == *it)
6546 break;
6547 }
6548 ceph_assert(i != ALL_CHOICES.end());
6549 {
6550 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6551 if (p->opts.is_set(key)) {
6552 if(key == pool_opts_t::CSUM_TYPE) {
6553 int64_t val;
6554 p->opts.get(key, &val);
6555 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6556 } else {
6557 ss << i->first << ": " << p->opts.get(key) << "\n";
6558 }
6559 }
6560 }
6561 break;
6562 }
6563 rdata.append(ss.str());
6564 ss.str("");
6565 }
6566 }
6567 r = 0;
6568 } else if (prefix == "osd pool get-quota") {
6569 string pool_name;
6570 cmd_getval(cmdmap, "pool", pool_name);
6571
6572 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6573 if (poolid < 0) {
6574 ceph_assert(poolid == -ENOENT);
6575 ss << "unrecognized pool '" << pool_name << "'";
6576 r = -ENOENT;
6577 goto reply;
6578 }
6579 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6580 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6581 if (!pstat) {
6582 ss << "no stats for pool '" << pool_name << "'";
6583 r = -ENOENT;
6584 goto reply;
6585 }
6586 const object_stat_sum_t& sum = pstat->stats.sum;
6587 if (f) {
6588 f->open_object_section("pool_quotas");
6589 f->dump_string("pool_name", pool_name);
6590 f->dump_unsigned("pool_id", poolid);
6591 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6592 f->dump_int("current_num_objects", sum.num_objects);
6593 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6594 f->dump_int("current_num_bytes", sum.num_bytes);
6595 f->close_section();
6596 f->flush(rdata);
6597 } else {
6598 stringstream rs;
6599 rs << "quotas for pool '" << pool_name << "':\n"
6600 << " max objects: ";
6601 if (p->quota_max_objects == 0)
6602 rs << "N/A";
6603 else {
6604 rs << si_u_t(p->quota_max_objects) << " objects";
6605 rs << " (current num objects: " << sum.num_objects << " objects)";
6606 }
6607 rs << "\n"
6608 << " max bytes : ";
6609 if (p->quota_max_bytes == 0)
6610 rs << "N/A";
6611 else {
6612 rs << byte_u_t(p->quota_max_bytes);
6613 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6614 }
6615 rdata.append(rs.str());
6616 }
6617 rdata.append("\n");
6618 r = 0;
6619 } else if (prefix == "osd crush rule list" ||
6620 prefix == "osd crush rule ls") {
6621 if (f) {
6622 f->open_array_section("rules");
6623 osdmap.crush->list_rules(f.get());
6624 f->close_section();
6625 f->flush(rdata);
6626 } else {
6627 ostringstream ss;
6628 osdmap.crush->list_rules(&ss);
6629 rdata.append(ss.str());
6630 }
6631 } else if (prefix == "osd crush rule ls-by-class") {
6632 string class_name;
6633 cmd_getval(cmdmap, "class", class_name);
6634 if (class_name.empty()) {
6635 ss << "no class specified";
6636 r = -EINVAL;
6637 goto reply;
6638 }
6639 set<int> rules;
6640 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6641 if (r < 0) {
6642 ss << "failed to get rules by class '" << class_name << "'";
6643 goto reply;
6644 }
6645 if (f) {
6646 f->open_array_section("rules");
6647 for (auto &rule: rules) {
6648 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6649 }
6650 f->close_section();
6651 f->flush(rdata);
6652 } else {
6653 ostringstream rs;
6654 for (auto &rule: rules) {
6655 rs << osdmap.crush->get_rule_name(rule) << "\n";
6656 }
6657 rdata.append(rs.str());
6658 }
6659 } else if (prefix == "osd crush rule dump") {
6660 string name;
6661 cmd_getval(cmdmap, "name", name);
6662 string format;
6663 cmd_getval(cmdmap, "format", format);
6664 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6665 if (name == "") {
6666 f->open_array_section("rules");
6667 osdmap.crush->dump_rules(f.get());
6668 f->close_section();
6669 } else {
6670 int ruleno = osdmap.crush->get_rule_id(name);
6671 if (ruleno < 0) {
6672 ss << "unknown crush rule '" << name << "'";
6673 r = ruleno;
6674 goto reply;
6675 }
6676 osdmap.crush->dump_rule(ruleno, f.get());
6677 }
6678 ostringstream rs;
6679 f->flush(rs);
6680 rs << "\n";
6681 rdata.append(rs.str());
6682 } else if (prefix == "osd crush dump") {
6683 string format;
6684 cmd_getval(cmdmap, "format", format);
6685 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6686 f->open_object_section("crush_map");
6687 osdmap.crush->dump(f.get());
6688 f->close_section();
6689 ostringstream rs;
6690 f->flush(rs);
6691 rs << "\n";
6692 rdata.append(rs.str());
6693 } else if (prefix == "osd crush show-tunables") {
6694 string format;
6695 cmd_getval(cmdmap, "format", format);
6696 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6697 f->open_object_section("crush_map_tunables");
6698 osdmap.crush->dump_tunables(f.get());
6699 f->close_section();
6700 ostringstream rs;
6701 f->flush(rs);
6702 rs << "\n";
6703 rdata.append(rs.str());
6704 } else if (prefix == "osd crush tree") {
6705 bool show_shadow = false;
6706 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6707 std::string shadow;
6708 if (cmd_getval(cmdmap, "shadow", shadow) &&
6709 shadow == "--show-shadow") {
6710 show_shadow = true;
6711 }
6712 }
6713 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6714 if (f) {
6715 f->open_object_section("crush_tree");
6716 osdmap.crush->dump_tree(nullptr,
6717 f.get(),
6718 osdmap.get_pool_names(),
6719 show_shadow);
6720 f->close_section();
6721 f->flush(rdata);
6722 } else {
6723 ostringstream ss;
6724 osdmap.crush->dump_tree(&ss,
6725 nullptr,
6726 osdmap.get_pool_names(),
6727 show_shadow);
6728 rdata.append(ss.str());
6729 }
6730 } else if (prefix == "osd crush ls") {
6731 string name;
6732 if (!cmd_getval(cmdmap, "node", name)) {
6733 ss << "no node specified";
6734 r = -EINVAL;
6735 goto reply;
6736 }
6737 if (!osdmap.crush->name_exists(name)) {
6738 ss << "node '" << name << "' does not exist";
6739 r = -ENOENT;
6740 goto reply;
6741 }
6742 int id = osdmap.crush->get_item_id(name);
6743 list<int> result;
6744 if (id >= 0) {
6745 result.push_back(id);
6746 } else {
6747 int num = osdmap.crush->get_bucket_size(id);
6748 for (int i = 0; i < num; ++i) {
6749 result.push_back(osdmap.crush->get_bucket_item(id, i));
6750 }
6751 }
6752 if (f) {
6753 f->open_array_section("items");
6754 for (auto i : result) {
6755 f->dump_string("item", osdmap.crush->get_item_name(i));
6756 }
6757 f->close_section();
6758 f->flush(rdata);
6759 } else {
6760 ostringstream ss;
6761 for (auto i : result) {
6762 ss << osdmap.crush->get_item_name(i) << "\n";
6763 }
6764 rdata.append(ss.str());
6765 }
6766 r = 0;
6767 } else if (prefix == "osd crush class ls") {
6768 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6769 f->open_array_section("crush_classes");
6770 for (auto i : osdmap.crush->class_name)
6771 f->dump_string("class", i.second);
6772 f->close_section();
6773 f->flush(rdata);
6774 } else if (prefix == "osd crush class ls-osd") {
6775 string name;
6776 cmd_getval(cmdmap, "class", name);
6777 set<int> osds;
6778 osdmap.crush->get_devices_by_class(name, &osds);
6779 if (f) {
6780 f->open_array_section("osds");
6781 for (auto &osd: osds)
6782 f->dump_int("osd", osd);
6783 f->close_section();
6784 f->flush(rdata);
6785 } else {
6786 bool first = true;
6787 for (auto &osd : osds) {
6788 if (!first)
6789 ds << "\n";
6790 first = false;
6791 ds << osd;
6792 }
6793 rdata.append(ds);
6794 }
6795 } else if (prefix == "osd crush get-device-class") {
6796 vector<string> idvec;
6797 cmd_getval(cmdmap, "ids", idvec);
6798 map<int, string> class_by_osd;
6799 for (auto& id : idvec) {
6800 ostringstream ts;
6801 long osd = parse_osd_id(id.c_str(), &ts);
6802 if (osd < 0) {
6803 ss << "unable to parse osd id:'" << id << "'";
6804 r = -EINVAL;
6805 goto reply;
6806 }
6807 auto device_class = osdmap.crush->get_item_class(osd);
6808 if (device_class)
6809 class_by_osd[osd] = device_class;
6810 else
6811 class_by_osd[osd] = ""; // no class
6812 }
6813 if (f) {
6814 f->open_array_section("osd_device_classes");
6815 for (auto& i : class_by_osd) {
6816 f->open_object_section("osd_device_class");
6817 f->dump_int("osd", i.first);
6818 f->dump_string("device_class", i.second);
6819 f->close_section();
6820 }
6821 f->close_section();
6822 f->flush(rdata);
6823 } else {
6824 if (class_by_osd.size() == 1) {
6825 // for single input, make a clean output
6826 ds << class_by_osd.begin()->second;
6827 } else {
6828 // note that we do not group osds by class here
6829 for (auto it = class_by_osd.begin();
6830 it != class_by_osd.end();
6831 it++) {
6832 ds << "osd." << it->first << ' ' << it->second;
6833 if (next(it) != class_by_osd.end())
6834 ds << '\n';
6835 }
6836 }
6837 rdata.append(ds);
6838 }
6839 } else if (prefix == "osd erasure-code-profile ls") {
6840 const auto &profiles = osdmap.get_erasure_code_profiles();
6841 if (f)
6842 f->open_array_section("erasure-code-profiles");
6843 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6844 if (f)
6845 f->dump_string("profile", i->first.c_str());
6846 else
6847 rdata.append(i->first + "\n");
6848 }
6849 if (f) {
6850 f->close_section();
6851 ostringstream rs;
6852 f->flush(rs);
6853 rs << "\n";
6854 rdata.append(rs.str());
6855 }
6856 } else if (prefix == "osd crush weight-set ls") {
6857 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6858 if (f) {
6859 f->open_array_section("weight_sets");
6860 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6861 f->dump_string("pool", "(compat)");
6862 }
6863 for (auto& i : osdmap.crush->choose_args) {
6864 if (i.first >= 0) {
6865 f->dump_string("pool", osdmap.get_pool_name(i.first));
6866 }
6867 }
6868 f->close_section();
6869 f->flush(rdata);
6870 } else {
6871 ostringstream rs;
6872 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6873 rs << "(compat)\n";
6874 }
6875 for (auto& i : osdmap.crush->choose_args) {
6876 if (i.first >= 0) {
6877 rs << osdmap.get_pool_name(i.first) << "\n";
6878 }
6879 }
6880 rdata.append(rs.str());
6881 }
6882 } else if (prefix == "osd crush weight-set dump") {
6883 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6884 "json-pretty"));
6885 osdmap.crush->dump_choose_args(f.get());
6886 f->flush(rdata);
6887 } else if (prefix == "osd erasure-code-profile get") {
6888 string name;
6889 cmd_getval(cmdmap, "name", name);
6890 if (!osdmap.has_erasure_code_profile(name)) {
6891 ss << "unknown erasure code profile '" << name << "'";
6892 r = -ENOENT;
6893 goto reply;
6894 }
6895 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6896 if (f)
6897 f->open_object_section("profile");
6898 for (map<string,string>::const_iterator i = profile.begin();
6899 i != profile.end();
6900 ++i) {
6901 if (f)
6902 f->dump_string(i->first.c_str(), i->second.c_str());
6903 else
6904 rdata.append(i->first + "=" + i->second + "\n");
6905 }
6906 if (f) {
6907 f->close_section();
6908 ostringstream rs;
6909 f->flush(rs);
6910 rs << "\n";
6911 rdata.append(rs.str());
6912 }
6913 } else if (prefix == "osd pool application get") {
6914 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6915 "json-pretty"));
6916 string pool_name;
6917 cmd_getval(cmdmap, "pool", pool_name);
6918 string app;
6919 cmd_getval(cmdmap, "app", app);
6920 string key;
6921 cmd_getval(cmdmap, "key", key);
6922
6923 if (pool_name.empty()) {
6924 // all
6925 f->open_object_section("pools");
6926 for (const auto &pool : osdmap.pools) {
6927 std::string name("<unknown>");
6928 const auto &pni = osdmap.pool_name.find(pool.first);
6929 if (pni != osdmap.pool_name.end())
6930 name = pni->second;
6931 f->open_object_section(name.c_str());
6932 for (auto &app_pair : pool.second.application_metadata) {
6933 f->open_object_section(app_pair.first.c_str());
6934 for (auto &kv_pair : app_pair.second) {
6935 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6936 }
6937 f->close_section();
6938 }
6939 f->close_section(); // name
6940 }
6941 f->close_section(); // pools
6942 f->flush(rdata);
6943 } else {
6944 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6945 if (pool < 0) {
6946 ss << "unrecognized pool '" << pool_name << "'";
6947 r = -ENOENT;
6948 goto reply;
6949 }
6950 auto p = osdmap.get_pg_pool(pool);
6951 // filter by pool
6952 if (app.empty()) {
6953 f->open_object_section(pool_name.c_str());
6954 for (auto &app_pair : p->application_metadata) {
6955 f->open_object_section(app_pair.first.c_str());
6956 for (auto &kv_pair : app_pair.second) {
6957 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6958 }
6959 f->close_section(); // application
6960 }
6961 f->close_section(); // pool_name
6962 f->flush(rdata);
6963 goto reply;
6964 }
6965
6966 auto app_it = p->application_metadata.find(app);
6967 if (app_it == p->application_metadata.end()) {
6968 ss << "pool '" << pool_name << "' has no application '" << app << "'";
6969 r = -ENOENT;
6970 goto reply;
6971 }
6972 // filter by pool + app
6973 if (key.empty()) {
6974 f->open_object_section(app_it->first.c_str());
6975 for (auto &kv_pair : app_it->second) {
6976 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6977 }
6978 f->close_section(); // application
6979 f->flush(rdata);
6980 goto reply;
6981 }
6982 // filter by pool + app + key
6983 auto key_it = app_it->second.find(key);
6984 if (key_it == app_it->second.end()) {
6985 ss << "application '" << app << "' on pool '" << pool_name
6986 << "' does not have key '" << key << "'";
6987 r = -ENOENT;
6988 goto reply;
6989 }
6990 ss << key_it->second << "\n";
6991 rdata.append(ss.str());
6992 ss.str("");
6993 }
6994 } else if (prefix == "osd get-require-min-compat-client") {
6995 ss << osdmap.require_min_compat_client << std::endl;
6996 rdata.append(ss.str());
6997 ss.str("");
6998 goto reply;
6999 } else if (prefix == "osd pool application enable" ||
7000 prefix == "osd pool application disable" ||
7001 prefix == "osd pool application set" ||
7002 prefix == "osd pool application rm") {
7003 bool changed = false;
7004 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7005 if (r != 0) {
7006 // Error, reply.
7007 goto reply;
7008 } else if (changed) {
7009 // Valid mutation, proceed to prepare phase
7010 return false;
7011 } else {
7012 // Idempotent case, reply
7013 goto reply;
7014 }
7015 } else {
7016 // try prepare update
7017 return false;
7018 }
7019
7020 reply:
7021 string rs;
7022 getline(ss, rs);
7023 mon.reply_command(op, r, rs, rdata, get_last_committed());
7024 return true;
7025 }
7026
7027 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7028 {
7029 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7030 osdmap.get_pg_pool(pool_id));
7031 ceph_assert(pool);
7032 pool->set_flag(flags);
7033 }
7034
7035 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7036 {
7037 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7038 osdmap.get_pg_pool(pool_id));
7039 ceph_assert(pool);
7040 pool->unset_flag(flags);
7041 }
7042
7043 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7044 {
7045 char k[80];
7046 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7047 return k;
7048 }
7049
7050 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7051 {
7052 char k[80];
7053 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7054 (unsigned long long)pool, (unsigned long long)snap);
7055 return k;
7056 }
7057
// Encode the value blob for a purged-snap interval [snap, snap+num) and
// return the KV key it should be stored under.
//
// The key embeds the *last* snapid of the interval (snap + num - 1), not
// the first, so lookup_purged_snap() can locate the interval containing
// an arbitrary snapid with a single forward lower_bound scan.
//
// @param pool   pool the snaps belong to
// @param snap   first snapid of the purged interval
// @param num    number of snapids in the interval
// @param epoch  epoch recorded alongside the interval
// @param v      out: encoded value (begin snapid, end snapid, epoch)
// @return the key under which *v should be stored
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7069
7070
// Look up the purged-snap interval containing `snap` for `pool` in the
// monitor's OSD_SNAP_PREFIX keyspace.
//
// Keys embed the interval's *last* snapid (see
// make_purged_snap_key_value), so a lower_bound on the key built from
// `snap` lands on the one record whose interval could contain it.
//
// @param pool   pool to search
// @param snap   snapid to locate
// @param begin  out: first snapid of the containing interval
// @param end    out: one past the last snapid of the interval
// @return 0 if an interval containing `snap` was found, -ENOENT otherwise
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the keyspace: no record at or after `k`
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on some other key type stored under the same prefix
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  // parse the pool id back out of the key we landed on
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // landed on a record belonging to a different pool
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value is (begin, end, epoch); we only need the interval bounds here
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // record exists but its [begin,end) interval does not cover snap
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7120
// Record that snaps [start,end) of `pool` were purged, coalescing with
// any adjacent purged intervals already stored in OSD_SNAP_PREFIX.
//
// Because each record's key embeds the interval's *last* snapid, merging
// with the earlier neighbor requires erasing its old key (keyed by
// before_end - 1), while merging with the later neighbor can simply
// overwrite that neighbor's key in place.
//
// NOTE(review): the `epoch` parameter is unused; pending_inc.epoch is
// recorded in the merged records instead — confirm this is intended.
//
// @param pool   pool whose snaps were purged
// @param start  first purged snapid
// @param end    one past the last purged snapid
// @param epoch  epoch of the purge (currently not recorded; see NOTE)
// @param t      transaction to stage the keyspace updates in
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // does an existing interval touch our left edge?  our right edge?
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right; its key changes, so
    // erase the old record before writing the merged one
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left; the last snapid (and hence
    // the key) is unchanged, so a plain overwrite suffices
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // no adjacent intervals: write a fresh record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7176
// Prune snap intervals that the OSDs (via the mgr stat digest) report as
// fully purged, moving them into pending_inc.new_purged_snaps.  Work per
// epoch is bounded by mon_max_snap_prune_per_epoch.
//
// @return true if anything was staged into the pending incremental
bool OSDMonitor::try_prune_purged_snaps()
{
  if (!mon.mgrstatmon()->is_readable()) {
    // no usable mgr stats digest yet
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    // 0 means "no limit"; substitute a large cap
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	// per-epoch budget exhausted for this pool's candidates
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      // overall per-epoch budget exhausted; stop scanning pools
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  return !!actually_pruned;
}
7255
// Re-evaluate every pool's quota usage against mgr stats and toggle the
// FLAG_FULL_QUOTA / FLAG_FULL pending flags accordingly.
//
// @return true if any pool's pending flags were changed
bool OSDMonitor::update_pools_status()
{
  // need a readable mgr stat digest to know per-pool usage
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a pool is quota-full when either byte or object quota is reached
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // pool is currently flagged; clear the flags once it drops below quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // pool is not flagged yet; flag it if it has now hit a quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7316
7317 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7318 {
7319 op->mark_osdmon_event(__func__);
7320 auto m = op->get_req<MPoolOp>();
7321 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7322 MonSession *session = op->get_session();
7323 if (!session)
7324 return -EPERM;
7325 string erasure_code_profile;
7326 stringstream ss;
7327 string rule_name;
7328 bool bulk = false;
7329 int ret = 0;
7330 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7331 0, 0, 0, 0, 0, 0, 0.0,
7332 erasure_code_profile,
7333 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7334 &ss);
7335
7336 if (ret < 0) {
7337 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7338 }
7339 return ret;
7340 }
7341
7342 int OSDMonitor::crush_rename_bucket(const string& srcname,
7343 const string& dstname,
7344 ostream *ss)
7345 {
7346 int ret;
7347 //
7348 // Avoid creating a pending crush if it does not already exists and
7349 // the rename would fail.
7350 //
7351 if (!_have_pending_crush()) {
7352 ret = _get_stable_crush().can_rename_bucket(srcname,
7353 dstname,
7354 ss);
7355 if (ret)
7356 return ret;
7357 }
7358
7359 CrushWrapper newcrush = _get_pending_crush();
7360
7361 ret = newcrush.rename_bucket(srcname,
7362 dstname,
7363 ss);
7364 if (ret)
7365 return ret;
7366
7367 pending_inc.crush.clear();
7368 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7369 *ss << "renamed bucket " << srcname << " into " << dstname;
7370 return 0;
7371 }
7372
7373 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7374 {
7375 string replacement = "";
7376
7377 if (plugin == "jerasure_generic" ||
7378 plugin == "jerasure_sse3" ||
7379 plugin == "jerasure_sse4" ||
7380 plugin == "jerasure_neon") {
7381 replacement = "jerasure";
7382 } else if (plugin == "shec_generic" ||
7383 plugin == "shec_sse3" ||
7384 plugin == "shec_sse4" ||
7385 plugin == "shec_neon") {
7386 replacement = "shec";
7387 }
7388
7389 if (replacement != "") {
7390 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7391 << plugin << " that has been deprecated. Please use "
7392 << replacement << " instead." << dendl;
7393 }
7394 }
7395
7396 int OSDMonitor::normalize_profile(const string& profilename,
7397 ErasureCodeProfile &profile,
7398 bool force,
7399 ostream *ss)
7400 {
7401 ErasureCodeInterfaceRef erasure_code;
7402 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7403 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7404 check_legacy_ec_plugin(plugin->second, profilename);
7405 int err = instance.factory(plugin->second,
7406 g_conf().get_val<std::string>("erasure_code_dir"),
7407 profile, &erasure_code, ss);
7408 if (err) {
7409 return err;
7410 }
7411
7412 err = erasure_code->init(profile, ss);
7413 if (err) {
7414 return err;
7415 }
7416
7417 auto it = profile.find("stripe_unit");
7418 if (it != profile.end()) {
7419 string err_str;
7420 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7421 if (!err_str.empty()) {
7422 *ss << "could not parse stripe_unit '" << it->second
7423 << "': " << err_str << std::endl;
7424 return -EINVAL;
7425 }
7426 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7427 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7428 if (chunk_size != stripe_unit) {
7429 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7430 << "alignment. Would be padded to " << chunk_size
7431 << std::endl;
7432 return -EINVAL;
7433 }
7434 if ((stripe_unit % 4096) != 0 && !force) {
7435 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7436 << "use --force to override this check" << std::endl;
7437 return -EINVAL;
7438 }
7439 }
7440 return 0;
7441 }
7442
// Find or create a crush rule named `name` for an erasure-code profile.
//
// Return-code protocol (callers depend on these distinctions):
//   -EEXIST   rule already exists in the committed map; *rule is set
//   -EALREADY rule exists only in the pending map; caller should wait
//   0         rule was created in the pending map; *rule is set
//   <0        other error (plugin load or rule creation failure)
int OSDMonitor::crush_rule_create_erasure(const string &name,
					  const string &profile,
					  int *rule,
					  ostream *ss)
{
  // already committed?
  int ruleid = osdmap.crush->get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = ruleid;
    return -EEXIST;
  }

  CrushWrapper newcrush = _get_pending_crush();

  // already staged but not yet committed?
  ruleid = newcrush.get_rule_id(name);
  if (ruleid != -ENOENT) {
    *rule = ruleid;
    return -EALREADY;
  } else {
    ErasureCodeInterfaceRef erasure_code;
    int err = get_erasure_code(profile, &erasure_code, ss);
    if (err) {
      *ss << "failed to load plugin using profile " << profile << std::endl;
      return err;
    }

    // create_rule returns the new rule id on success, <0 on error
    err = erasure_code->create_rule(name, newcrush, ss);
    erasure_code.reset();
    if (err < 0)
      return err;
    *rule = err;
    // stage the modified crush map into the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    return 0;
  }
}
7478
7479 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7480 ErasureCodeInterfaceRef *erasure_code,
7481 ostream *ss) const
7482 {
7483 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7484 return -EAGAIN;
7485 ErasureCodeProfile profile =
7486 osdmap.get_erasure_code_profile(erasure_code_profile);
7487 ErasureCodeProfile::const_iterator plugin =
7488 profile.find("plugin");
7489 if (plugin == profile.end()) {
7490 *ss << "cannot determine the erasure code plugin"
7491 << " because there is no 'plugin' entry in the erasure_code_profile "
7492 << profile << std::endl;
7493 return -EINVAL;
7494 }
7495 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7496 auto& instance = ErasureCodePluginRegistry::instance();
7497 return instance.factory(plugin->second,
7498 g_conf().get_val<std::string>("erasure_code_dir"),
7499 profile, erasure_code, ss);
7500 }
7501
7502 int OSDMonitor::check_cluster_features(uint64_t features,
7503 stringstream &ss)
7504 {
7505 stringstream unsupported_ss;
7506 int unsupported_count = 0;
7507 if ((mon.get_quorum_con_features() & features) != features) {
7508 unsupported_ss << "the monitor cluster";
7509 ++unsupported_count;
7510 }
7511
7512 set<int32_t> up_osds;
7513 osdmap.get_up_osds(up_osds);
7514 for (set<int32_t>::iterator it = up_osds.begin();
7515 it != up_osds.end(); ++it) {
7516 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7517 if ((xi.features & features) != features) {
7518 if (unsupported_count > 0)
7519 unsupported_ss << ", ";
7520 unsupported_ss << "osd." << *it;
7521 unsupported_count ++;
7522 }
7523 }
7524
7525 if (unsupported_count > 0) {
7526 ss << "features " << features << " unsupported by: "
7527 << unsupported_ss.str();
7528 return -ENOTSUP;
7529 }
7530
7531 // check pending osd state, too!
7532 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7533 pending_inc.new_xinfo.begin();
7534 p != pending_inc.new_xinfo.end(); ++p) {
7535 const osd_xinfo_t &xi = p->second;
7536 if ((xi.features & features) != features) {
7537 dout(10) << __func__ << " pending osd." << p->first
7538 << " features are insufficient; retry" << dendl;
7539 return -EAGAIN;
7540 }
7541 }
7542
7543 return 0;
7544 }
7545
// Check that adopting `newcrush` would not require features beyond what
// the configured require_min_compat_client, the monitor quorum, and the
// up OSDs can support.  Works on a throwaway copy of the osdmap with the
// new crush applied; nothing is committed here.
//
// @param newcrush  candidate crush map
// @param ss        out: human-readable reason when returning false
// @return true if the new crush map is acceptable
bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
						 stringstream& ss)
{
  // build a scratch map: current pending incremental + the new crush
  OSDMap::Incremental new_pending = pending_inc;
  encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
  OSDMap newmap;
  newmap.deepish_copy_from(osdmap);
  newmap.apply_incremental(new_pending);

  // client compat
  if (newmap.require_min_compat_client != ceph_release_t::unknown) {
    auto mv = newmap.get_min_compat_client();
    if (mv > newmap.require_min_compat_client) {
      ss << "new crush map requires client version " << mv
	 << " but require_min_compat_client is "
	 << newmap.require_min_compat_client;
      return false;
    }
  }

  // osd compat
  uint64_t features =
    newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
    newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
  stringstream features_ss;
  int r = check_cluster_features(features, features_ss);
  if (r) {
    ss << "Could not change CRUSH: " << features_ss.str();
    return false;
  }

  return true;
}
7579
7580 bool OSDMonitor::erasure_code_profile_in_use(
7581 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7582 const string &profile,
7583 ostream *ss)
7584 {
7585 bool found = false;
7586 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7587 p != pools.end();
7588 ++p) {
7589 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7590 *ss << osdmap.pool_name[p->first] << " ";
7591 found = true;
7592 }
7593 }
7594 if (found) {
7595 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7596 }
7597 return found;
7598 }
7599
7600 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7601 map<string,string> *erasure_code_profile_map,
7602 ostream *ss)
7603 {
7604 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7605 get_json_str_map,
7606 *ss,
7607 erasure_code_profile_map,
7608 true);
7609 if (r)
7610 return r;
7611 ceph_assert((*erasure_code_profile_map).count("plugin"));
7612 string default_plugin = (*erasure_code_profile_map)["plugin"];
7613 map<string,string> user_map;
7614 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7615 i != erasure_code_profile.end();
7616 ++i) {
7617 size_t equal = i->find('=');
7618 if (equal == string::npos) {
7619 user_map[*i] = string();
7620 (*erasure_code_profile_map)[*i] = string();
7621 } else {
7622 const string key = i->substr(0, equal);
7623 equal++;
7624 const string value = i->substr(equal);
7625 if (key.find("ruleset-") == 0) {
7626 *ss << "property '" << key << "' is no longer supported; try "
7627 << "'crush-" << key.substr(8) << "' instead";
7628 return -EINVAL;
7629 }
7630 user_map[key] = value;
7631 (*erasure_code_profile_map)[key] = value;
7632 }
7633 }
7634
7635 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7636 (*erasure_code_profile_map) = user_map;
7637
7638 return 0;
7639 }
7640
7641 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7642 const string &erasure_code_profile,
7643 uint8_t repl_size,
7644 unsigned *size, unsigned *min_size,
7645 ostream *ss)
7646 {
7647 int err = 0;
7648 bool set_min_size = false;
7649 switch (pool_type) {
7650 case pg_pool_t::TYPE_REPLICATED:
7651 if (osdmap.stretch_mode_enabled) {
7652 if (repl_size == 0)
7653 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7654 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7655 *ss << "prepare_pool_size: we are in stretch mode but size "
7656 << repl_size << " does not match!";
7657 return -EINVAL;
7658 }
7659 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7660 set_min_size = true;
7661 }
7662 if (repl_size == 0) {
7663 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7664 }
7665 *size = repl_size;
7666 if (!set_min_size)
7667 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7668 break;
7669 case pg_pool_t::TYPE_ERASURE:
7670 {
7671 if (osdmap.stretch_mode_enabled) {
7672 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7673 return -EINVAL;
7674 }
7675 ErasureCodeInterfaceRef erasure_code;
7676 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7677 if (err == 0) {
7678 *size = erasure_code->get_chunk_count();
7679 *min_size =
7680 erasure_code->get_data_chunk_count() +
7681 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7682 assert(*min_size <= *size);
7683 assert(*min_size >= erasure_code->get_data_chunk_count());
7684 }
7685 }
7686 break;
7687 default:
7688 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7689 err = -EINVAL;
7690 break;
7691 }
7692 return err;
7693 }
7694
// Compute the stripe width for a new pool.  Only meaningful for erasure
// pools: width = data_chunks * chunk_size(stripe_unit * data_chunks),
// where stripe_unit comes from the profile if set, else from config.
// For replicated pools *stripe_width is left untouched.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile to consult for EC pools
// @param stripe_width          out: computed stripe width (EC only)
// @param ss                    out: human-readable error on failure
// @return 0 on success, negative error code otherwise
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      // profile's stripe_unit overrides the configured default
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	// profile was validated at creation time (normalize_profile)
	ceph_assert(err_str.empty());
      }
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7733
7734 int OSDMonitor::get_replicated_stretch_crush_rule()
7735 {
7736 /* we don't write down the stretch rule anywhere, so
7737 * we have to guess it. How? Look at all the pools
7738 * and count up how many times a given rule is used
7739 * on stretch pools and then return the one with
7740 * the most users!
7741 */
7742 map<int,int> rule_counts;
7743 for (const auto& pooli : osdmap.pools) {
7744 const pg_pool_t& p = pooli.second;
7745 if (p.is_replicated() && p.is_stretch_pool()) {
7746 if (!rule_counts.count(p.crush_rule)) {
7747 rule_counts[p.crush_rule] = 1;
7748 } else {
7749 ++rule_counts[p.crush_rule];
7750 }
7751 }
7752 }
7753
7754 if (rule_counts.empty()) {
7755 return -ENOENT;
7756 }
7757
7758 int most_used_count = 0;
7759 int most_used_rule = -1;
7760 for (auto i : rule_counts) {
7761 if (i.second > most_used_count) {
7762 most_used_rule = i.first;
7763 most_used_count = i.second;
7764 }
7765 }
7766 ceph_assert(most_used_count > 0);
7767 ceph_assert(most_used_rule >= 0);
7768 return most_used_rule;
7769 }
7770
// Resolve the crush rule for a new pool.
//
// If *crush_rule >= 0 it is taken as an explicit rule id and merely
// validated.  Otherwise the rule is derived: for replicated pools from
// rule_name (or the default / stretch-mode rule when empty); for erasure
// pools by finding or creating a rule from the EC profile.
//
// Note the EC error-code translation: crush_rule_create_erasure()'s
// 0 and -EALREADY both become -EAGAIN (the new/pending rule must commit
// before the pool can be created), while -EEXIST becomes success.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile used to create EC rules
// @param rule_name             explicit rule name, "" for default
// @param crush_rule            in/out: rule id (<0 on input = derive it)
// @param ss                    out: human-readable error on failure
// @return 0 on success, -EAGAIN to retry after commit, else negative error
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // guess the stretch rule from existing stretch pools
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  // rule already committed: usable right away
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // explicit rule id supplied; just validate it
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7835
7836 int OSDMonitor::get_crush_rule(const string &rule_name,
7837 int *crush_rule,
7838 ostream *ss)
7839 {
7840 int ret;
7841 ret = osdmap.crush->get_rule_id(rule_name);
7842 if (ret != -ENOENT) {
7843 // found it, use it
7844 *crush_rule = ret;
7845 } else {
7846 CrushWrapper newcrush = _get_pending_crush();
7847
7848 ret = newcrush.get_rule_id(rule_name);
7849 if (ret != -ENOENT) {
7850 // found it, wait for it to be proposed
7851 dout(20) << __func__ << ": rule " << rule_name
7852 << " try again" << dendl;
7853 return -EAGAIN;
7854 } else {
7855 // Cannot find it , return error
7856 *ss << "specified rule " << rule_name << " doesn't exist";
7857 return ret;
7858 }
7859 }
7860 return 0;
7861 }
7862
// Verify that creating a pool (pool < 0) or resizing an existing pool's
// pg_num would not push the projected cluster-wide PG count past
// mon_max_pg_per_osd * number-of-OSDs.
//
// When the PG mapping is current (mapping epoch >= osdmap epoch), the
// projection counts actual acting PGs on the OSDs reachable from
// crush_rule; otherwise it falls back to summing every pool's
// pg_num_target * size.
//
// @param pool        pool id being resized, or <0 for a new pool
// @param pg_num      requested pg_num
// @param size        pool size (replica / chunk count)
// @param crush_rule  rule whose OSDs the pool will map to
// @param ss          out: human-readable error on failure
// @return 0 if within bounds, -ERANGE otherwise
int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, int crush_rule, ostream *ss)
{
  auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
  uint64_t projected = 0;
  unsigned osd_num = 0;
  // assume min cluster size 3
  auto num_osds = std::max(osdmap.get_num_in_osds(), 3u);
  if (pool < 0) {
    // a new pool
    projected += pg_num * size;
  }
  if (mapping.get_epoch() >= osdmap.get_epoch()) {
    // PG mapping is up to date: count real acting PGs on the OSDs
    // reachable from this crush rule
    set<int> roots;
    CrushWrapper newcrush = _get_pending_crush();
    newcrush.find_takes_by_rule(crush_rule, &roots);
    int max_osd = osdmap.get_max_osd();
    for (auto root : roots) {
      const char *rootname = newcrush.get_item_name(root);
      set<int> osd_ids;
      newcrush.get_leaves(rootname, &osd_ids);
      unsigned out_osd = 0;
      for (auto id : osd_ids) {
	if (id > max_osd) {
	  // crush item beyond the osdmap's max osd id: not a usable OSD
	  out_osd++;
	  continue;
	}
	projected += mapping.get_osd_acting_pgs(id).size();
      }
      osd_num += osd_ids.size() - out_osd;
    }
    if (pool >= 0) {
      // update an existing pool's pg num
      // the crush-map walk above already counted this pool's current
      // PGs, so swap them out for the requested pg_num * size
      const auto& pg_info = osdmap.get_pools().at(pool);
      projected += pg_num * size;
      projected -= pg_info.get_pg_num_target() * pg_info.get_size();
    }
    num_osds = std::max(osd_num, 3u); // assume min cluster size 3
  } else {
    // use pg_num target for evaluating the projected pg num
    for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
      if (pool_id == pool) {
	projected += pg_num * size;
      } else {
	projected += pool_info.get_pg_num_target() * pool_info.get_size();
      }
    }
  }
  auto max_pgs = max_pgs_per_osd * num_osds;
  if (projected > max_pgs) {
    if (pool >= 0) {
      *ss << "pool id " << pool;
    }
    *ss << " pg_num " << pg_num << " size " << size
	<< " would mean " << projected
	<< " total pgs, which exceeds max " << max_pgs
	<< " (mon_max_pg_per_osd " << max_pgs_per_osd
	<< " * num_in_osds " << num_osds << ")";
    return -ERANGE;
  }
  return 0;
}
7926
/**
 * Build the pending incremental that creates a new pool.
 *
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param pg_num_min min pg_num
 * @param pg_num_max max pg_num
 * @param repl_size Replication factor, or 0 for default
 * @param target_size_bytes Target pool size for the pg autoscaler, or 0 to leave unset
 * @param target_size_ratio Target capacity ratio for the pg autoscaler, or <=0 to leave unset
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REP
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param pg_autoscale_mode pg autoscale mode; empty means use the configured default
 * @param bulk if true, flag the pool as bulk for the autoscaler
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 bool bulk,
				 ostream *ss)
{
  if (name.length() == 0)
    return -EINVAL;
  // Choose a default pg_num.  With the autoscaler "on" we deliberately
  // start at a single PG and let the autoscaler grow the pool; otherwise
  // fall back to the configured cluster default.
  if (pg_num == 0) {
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
      return mode == "on" ? 1 : pg_num;
    };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  // A zero default for pgp_num means "track pg_num".
  if (!pgp_num)
    pgp_num = pg_num;
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
        << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
        << " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read only makes sense for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  int r;
  // Resolve (or create) the crush rule for this pool type/profile.
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Derive size/min_size from repl_size or the EC profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Smoke-test the rule: try mapping a handful of inputs (x in [0,50))
  // in a forked child, bounded by the mon lease so a bad rule cannot
  // stall the monitor.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
             << duration << dendl;
  }
  // Make sure the new pool's PGs do not push us over mon_max_pg_per_osd
  // (-1 == "a new pool" for check_pg_num's accounting).
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Resolve the tri-state fast_read into a concrete bool (only relevant
  // for EC pools; replicated pools were rejected above if it was ON).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
        fread = false;
        break;
      case FAST_READ_ON:
        fread = true;
        break;
      case FAST_READ_DEFAULT:
        fread = g_conf()->osd_pool_default_ec_fast_read;
        break;
      default:
        *ss << "invalid fast_read setting: " << fast_read;
        return -EINVAL;
    }
  }

  // Idempotency: if a pool with this name is already pending creation in
  // this proposal, report success instead of creating a duplicate.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id off the pending incremental.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  pi->flags = g_conf()->osd_pool_default_flags;
  // FLAG_BULK comes from the explicit request or the cluster default.
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
      pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // Mark the pool as still creating; cleared once initial PGs exist.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // In stretch mode, constrain peering across the stretch buckets; in
  // degraded stretch mode halve size/min_size since only 2 zones are
  // supported.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Start from the cluster-default autoscale mode; may be overridden by
  // the per-pool request below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Cap the number of PGs actually created up front; pg_num_target keeps
  // the requested value and the mgr grows pg_num toward it later.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  // PG_NUM_MIN needs nautilus+, PG_NUM_MAX needs quincy+ (older OSDs do
  // not understand these pool opts).
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (stored as micro-units, hence * 1000000).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8181
8182 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8183 {
8184 op->mark_osdmon_event(__func__);
8185 ostringstream ss;
8186 if (pending_inc.new_flags < 0)
8187 pending_inc.new_flags = osdmap.get_flags();
8188 pending_inc.new_flags |= flag;
8189 ss << OSDMap::get_flag_string(flag) << " is set";
8190 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8191 get_last_committed() + 1));
8192 return true;
8193 }
8194
8195 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8196 {
8197 op->mark_osdmon_event(__func__);
8198 ostringstream ss;
8199 if (pending_inc.new_flags < 0)
8200 pending_inc.new_flags = osdmap.get_flags();
8201 pending_inc.new_flags &= ~flag;
8202 ss << OSDMap::get_flag_string(flag) << " is unset";
8203 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8204 get_last_committed() + 1));
8205 return true;
8206 }
8207
8208 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8209 stringstream& ss)
8210 {
8211 string poolstr;
8212 cmd_getval(cmdmap, "pool", poolstr);
8213 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8214 if (pool < 0) {
8215 ss << "unrecognized pool '" << poolstr << "'";
8216 return -ENOENT;
8217 }
8218 string var;
8219 cmd_getval(cmdmap, "var", var);
8220
8221 pg_pool_t p = *osdmap.get_pg_pool(pool);
8222 if (pending_inc.new_pools.count(pool))
8223 p = pending_inc.new_pools[pool];
8224
8225 // accept val as a json string in the normal case (current
8226 // generation monitor). parse out int or float values from the
8227 // string as needed. however, if it is not a string, try to pull
8228 // out an int, in case an older monitor with an older json schema is
8229 // forwarding a request.
8230 string val;
8231 string interr, floaterr;
8232 int64_t n = 0;
8233 double f = 0;
8234 int64_t uf = 0; // micro-f
8235 cmd_getval(cmdmap, "val", val);
8236
8237 auto si_options = {
8238 "target_max_objects"
8239 };
8240 auto iec_options = {
8241 "target_max_bytes",
8242 "target_size_bytes",
8243 "compression_max_blob_size",
8244 "compression_min_blob_size",
8245 "csum_max_block",
8246 "csum_min_block",
8247 };
8248 if (count(begin(si_options), end(si_options), var)) {
8249 n = strict_si_cast<int64_t>(val, &interr);
8250 } else if (count(begin(iec_options), end(iec_options), var)) {
8251 n = strict_iec_cast<int64_t>(val, &interr);
8252 } else {
8253 // parse string as both int and float; different fields use different types.
8254 n = strict_strtoll(val.c_str(), 10, &interr);
8255 f = strict_strtod(val.c_str(), &floaterr);
8256 uf = llrintl(f * (double)1000000.0);
8257 }
8258
8259 if (!p.is_tier() &&
8260 (var == "hit_set_type" || var == "hit_set_period" ||
8261 var == "hit_set_count" || var == "hit_set_fpp" ||
8262 var == "target_max_objects" || var == "target_max_bytes" ||
8263 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8264 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8265 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8266 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8267 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8268 return -EACCES;
8269 }
8270
8271 if (var == "size") {
8272 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8273 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8274 return -EPERM;
8275 }
8276 if (p.type == pg_pool_t::TYPE_ERASURE) {
8277 ss << "can not change the size of an erasure-coded pool";
8278 return -ENOTSUP;
8279 }
8280 if (interr.length()) {
8281 ss << "error parsing integer value '" << val << "': " << interr;
8282 return -EINVAL;
8283 }
8284 if (n <= 0 || n > 10) {
8285 ss << "pool size must be between 1 and 10";
8286 return -EINVAL;
8287 }
8288 if (n == 1) {
8289 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8290 ss << "configuring pool size as 1 is disabled by default.";
8291 return -EPERM;
8292 }
8293 bool sure = false;
8294 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8295 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8296 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8297 "pass the flag --yes-i-really-mean-it.";
8298 return -EPERM;
8299 }
8300 }
8301 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8302 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8303 return -EINVAL;
8304 }
8305 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8306 if (r < 0) {
8307 return r;
8308 }
8309 p.size = n;
8310 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8311 } else if (var == "min_size") {
8312 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8313 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8314 return -EPERM;
8315 }
8316 if (interr.length()) {
8317 ss << "error parsing integer value '" << val << "': " << interr;
8318 return -EINVAL;
8319 }
8320
8321 if (p.type != pg_pool_t::TYPE_ERASURE) {
8322 if (n < 1 || n > p.size) {
8323 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8324 return -EINVAL;
8325 }
8326 } else {
8327 ErasureCodeInterfaceRef erasure_code;
8328 int k;
8329 stringstream tmp;
8330 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8331 if (err == 0) {
8332 k = erasure_code->get_data_chunk_count();
8333 } else {
8334 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8335 return err;
8336 }
8337
8338 if (n < k || n > p.size) {
8339 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8340 return -EINVAL;
8341 }
8342 }
8343 p.min_size = n;
8344 } else if (var == "pg_num_actual") {
8345 if (interr.length()) {
8346 ss << "error parsing integer value '" << val << "': " << interr;
8347 return -EINVAL;
8348 }
8349 if (n == (int)p.get_pg_num()) {
8350 return 0;
8351 }
8352 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8353 ss << "'pg_num' must be greater than 0 and less than or equal to "
8354 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8355 << " (you may adjust 'mon max pool pg num' for higher values)";
8356 return -ERANGE;
8357 }
8358 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8359 ss << "cannot adjust pg_num while initial PGs are being created";
8360 return -EBUSY;
8361 }
8362 if (n > (int)p.get_pg_num()) {
8363 if (p.get_pg_num() != p.get_pg_num_pending()) {
8364 // force pre-nautilus clients to resend their ops, since they
8365 // don't understand pg_num_pending changes form a new interval
8366 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8367 }
8368 p.set_pg_num(n);
8369 } else {
8370 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8371 ss << "nautilus OSDs are required to adjust pg_num_pending";
8372 return -EPERM;
8373 }
8374 if (n < (int)p.get_pgp_num()) {
8375 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8376 return -EINVAL;
8377 }
8378 if (n < (int)p.get_pg_num() - 1) {
8379 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8380 << ") - 1; only single pg decrease is currently supported";
8381 return -EINVAL;
8382 }
8383 p.set_pg_num_pending(n);
8384 // force pre-nautilus clients to resend their ops, since they
8385 // don't understand pg_num_pending changes form a new interval
8386 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8387 }
8388 // force pre-luminous clients to resend their ops, since they
8389 // don't understand that split PGs now form a new interval.
8390 p.last_force_op_resend_preluminous = pending_inc.epoch;
8391 } else if (var == "pg_num") {
8392 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8393 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8394 return -EPERM;
8395 }
8396 if (interr.length()) {
8397 ss << "error parsing integer value '" << val << "': " << interr;
8398 return -EINVAL;
8399 }
8400 if (n == (int)p.get_pg_num_target()) {
8401 return 0;
8402 }
8403 if (n <= 0 || static_cast<uint64_t>(n) >
8404 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8405 ss << "'pg_num' must be greater than 0 and less than or equal to "
8406 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8407 << " (you may adjust 'mon max pool pg num' for higher values)";
8408 return -ERANGE;
8409 }
8410 if (n > (int)p.get_pg_num_target()) {
8411 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8412 if (r) {
8413 return r;
8414 }
8415 bool force = false;
8416 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8417 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8418 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8419 return -EPERM;
8420 }
8421 } else {
8422 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8423 ss << "nautilus OSDs are required to decrease pg_num";
8424 return -EPERM;
8425 }
8426 }
8427 int64_t pg_min = 0, pg_max = 0;
8428 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8429 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8430 if (pg_min && n < pg_min) {
8431 ss << "specified pg_num " << n
8432 << " < pg_num_min " << pg_min;
8433 return -EINVAL;
8434 }
8435 if (pg_max && n > pg_max) {
8436 ss << "specified pg_num " << n
8437 << " < pg_num_max " << pg_max;
8438 return -EINVAL;
8439 }
8440 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8441 // pre-nautilus osdmap format; increase pg_num directly
8442 assert(n > (int)p.get_pg_num());
8443 // force pre-nautilus clients to resend their ops, since they
8444 // don't understand pg_num_target changes form a new interval
8445 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8446 // force pre-luminous clients to resend their ops, since they
8447 // don't understand that split PGs now form a new interval.
8448 p.last_force_op_resend_preluminous = pending_inc.epoch;
8449 p.set_pg_num(n);
8450 } else {
8451 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8452 // make pgp_num track pg_num if it already matches. if it is set
8453 // differently, leave it different and let the user control it
8454 // manually.
8455 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8456 p.set_pgp_num_target(n);
8457 }
8458 p.set_pg_num_target(n);
8459 }
8460 } else if (var == "pgp_num_actual") {
8461 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8462 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8463 return -EPERM;
8464 }
8465 if (interr.length()) {
8466 ss << "error parsing integer value '" << val << "': " << interr;
8467 return -EINVAL;
8468 }
8469 if (n <= 0) {
8470 ss << "specified pgp_num must > 0, but you set to " << n;
8471 return -EINVAL;
8472 }
8473 if (n > (int)p.get_pg_num()) {
8474 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8475 return -EINVAL;
8476 }
8477 if (n > (int)p.get_pg_num_pending()) {
8478 ss << "specified pgp_num " << n
8479 << " > pg_num_pending " << p.get_pg_num_pending();
8480 return -EINVAL;
8481 }
8482 p.set_pgp_num(n);
8483 } else if (var == "pgp_num") {
8484 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8485 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8486 return -EPERM;
8487 }
8488 if (interr.length()) {
8489 ss << "error parsing integer value '" << val << "': " << interr;
8490 return -EINVAL;
8491 }
8492 if (n <= 0) {
8493 ss << "specified pgp_num must > 0, but you set to " << n;
8494 return -EINVAL;
8495 }
8496 if (n > (int)p.get_pg_num_target()) {
8497 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8498 return -EINVAL;
8499 }
8500 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8501 // pre-nautilus osdmap format; increase pgp_num directly
8502 p.set_pgp_num(n);
8503 } else {
8504 p.set_pgp_num_target(n);
8505 }
8506 } else if (var == "pg_autoscale_mode") {
8507 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8508 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8509 ss << "specified invalid mode " << val;
8510 return -EINVAL;
8511 }
8512 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8513 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8514 return -EINVAL;
8515 }
8516 p.pg_autoscale_mode = m;
8517 } else if (var == "crush_rule") {
8518 int id = osdmap.crush->get_rule_id(val);
8519 if (id == -ENOENT) {
8520 ss << "crush rule " << val << " does not exist";
8521 return -ENOENT;
8522 }
8523 if (id < 0) {
8524 ss << cpp_strerror(id);
8525 return -ENOENT;
8526 }
8527 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8528 ss << "crush rule " << id << " type does not match pool";
8529 return -EINVAL;
8530 }
8531 p.crush_rule = id;
8532 } else if (var == "nodelete" || var == "nopgchange" ||
8533 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8534 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8535 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8536 // make sure we only compare against 'n' if we didn't receive a string
8537 if (val == "true" || (interr.empty() && n == 1)) {
8538 p.set_flag(flag);
8539 } else if (val == "false" || (interr.empty() && n == 0)) {
8540 p.unset_flag(flag);
8541 } else {
8542 ss << "expecting value 'true', 'false', '0', or '1'";
8543 return -EINVAL;
8544 }
8545 } else if (var == "eio") {
8546 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8547
8548 // make sure we only compare against 'n' if we didn't receive a string
8549 if (val == "true" || (interr.empty() && n == 1)) {
8550 p.set_flag(flag);
8551 } else if (val == "false" || (interr.empty() && n == 0)) {
8552 p.unset_flag(flag);
8553 } else {
8554 ss << "expecting value 'true', 'false', '0', or '1'";
8555 return -EINVAL;
8556 }
8557 } else if (var == "hashpspool") {
8558 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8559 bool force = false;
8560 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8561
8562 if (!force) {
8563 ss << "are you SURE? this will remap all placement groups in this pool,"
8564 " this triggers large data movement,"
8565 " pass --yes-i-really-mean-it if you really do.";
8566 return -EPERM;
8567 }
8568 // make sure we only compare against 'n' if we didn't receive a string
8569 if (val == "true" || (interr.empty() && n == 1)) {
8570 p.set_flag(flag);
8571 } else if (val == "false" || (interr.empty() && n == 0)) {
8572 p.unset_flag(flag);
8573 } else {
8574 ss << "expecting value 'true', 'false', '0', or '1'";
8575 return -EINVAL;
8576 }
8577 } else if (var == "hit_set_type") {
8578 if (val == "none")
8579 p.hit_set_params = HitSet::Params();
8580 else {
8581 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8582 if (err)
8583 return err;
8584 if (val == "bloom") {
8585 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8586 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8587 p.hit_set_params = HitSet::Params(bsp);
8588 } else if (val == "explicit_hash")
8589 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8590 else if (val == "explicit_object")
8591 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8592 else {
8593 ss << "unrecognized hit_set type '" << val << "'";
8594 return -EINVAL;
8595 }
8596 }
8597 } else if (var == "hit_set_period") {
8598 if (interr.length()) {
8599 ss << "error parsing integer value '" << val << "': " << interr;
8600 return -EINVAL;
8601 } else if (n < 0) {
8602 ss << "hit_set_period should be non-negative";
8603 return -EINVAL;
8604 }
8605 p.hit_set_period = n;
8606 } else if (var == "hit_set_count") {
8607 if (interr.length()) {
8608 ss << "error parsing integer value '" << val << "': " << interr;
8609 return -EINVAL;
8610 } else if (n < 0) {
8611 ss << "hit_set_count should be non-negative";
8612 return -EINVAL;
8613 }
8614 p.hit_set_count = n;
8615 } else if (var == "hit_set_fpp") {
8616 if (floaterr.length()) {
8617 ss << "error parsing floating point value '" << val << "': " << floaterr;
8618 return -EINVAL;
8619 } else if (f < 0 || f > 1.0) {
8620 ss << "hit_set_fpp should be in the range 0..1";
8621 return -EINVAL;
8622 }
8623 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8624 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8625 return -EINVAL;
8626 }
8627 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8628 bloomp->set_fpp(f);
8629 } else if (var == "use_gmt_hitset") {
8630 if (val == "true" || (interr.empty() && n == 1)) {
8631 p.use_gmt_hitset = true;
8632 } else {
8633 ss << "expecting value 'true' or '1'";
8634 return -EINVAL;
8635 }
8636 } else if (var == "allow_ec_overwrites") {
8637 if (!p.is_erasure()) {
8638 ss << "ec overwrites can only be enabled for an erasure coded pool";
8639 return -EINVAL;
8640 }
8641 stringstream err;
8642 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8643 !is_pool_currently_all_bluestore(pool, p, &err)) {
8644 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8645 return -EINVAL;
8646 }
8647 if (val == "true" || (interr.empty() && n == 1)) {
8648 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8649 } else if (val == "false" || (interr.empty() && n == 0)) {
8650 ss << "ec overwrites cannot be disabled once enabled";
8651 return -EINVAL;
8652 } else {
8653 ss << "expecting value 'true', 'false', '0', or '1'";
8654 return -EINVAL;
8655 }
8656 } else if (var == "target_max_objects") {
8657 if (interr.length()) {
8658 ss << "error parsing int '" << val << "': " << interr;
8659 return -EINVAL;
8660 }
8661 p.target_max_objects = n;
8662 } else if (var == "target_max_bytes") {
8663 if (interr.length()) {
8664 ss << "error parsing int '" << val << "': " << interr;
8665 return -EINVAL;
8666 }
8667 p.target_max_bytes = n;
8668 } else if (var == "cache_target_dirty_ratio") {
8669 if (floaterr.length()) {
8670 ss << "error parsing float '" << val << "': " << floaterr;
8671 return -EINVAL;
8672 }
8673 if (f < 0 || f > 1.0) {
8674 ss << "value must be in the range 0..1";
8675 return -ERANGE;
8676 }
8677 p.cache_target_dirty_ratio_micro = uf;
8678 } else if (var == "cache_target_dirty_high_ratio") {
8679 if (floaterr.length()) {
8680 ss << "error parsing float '" << val << "': " << floaterr;
8681 return -EINVAL;
8682 }
8683 if (f < 0 || f > 1.0) {
8684 ss << "value must be in the range 0..1";
8685 return -ERANGE;
8686 }
8687 p.cache_target_dirty_high_ratio_micro = uf;
8688 } else if (var == "cache_target_full_ratio") {
8689 if (floaterr.length()) {
8690 ss << "error parsing float '" << val << "': " << floaterr;
8691 return -EINVAL;
8692 }
8693 if (f < 0 || f > 1.0) {
8694 ss << "value must be in the range 0..1";
8695 return -ERANGE;
8696 }
8697 p.cache_target_full_ratio_micro = uf;
8698 } else if (var == "cache_min_flush_age") {
8699 if (interr.length()) {
8700 ss << "error parsing int '" << val << "': " << interr;
8701 return -EINVAL;
8702 }
8703 p.cache_min_flush_age = n;
8704 } else if (var == "cache_min_evict_age") {
8705 if (interr.length()) {
8706 ss << "error parsing int '" << val << "': " << interr;
8707 return -EINVAL;
8708 }
8709 p.cache_min_evict_age = n;
8710 } else if (var == "min_read_recency_for_promote") {
8711 if (interr.length()) {
8712 ss << "error parsing integer value '" << val << "': " << interr;
8713 return -EINVAL;
8714 }
8715 p.min_read_recency_for_promote = n;
8716 } else if (var == "hit_set_grade_decay_rate") {
8717 if (interr.length()) {
8718 ss << "error parsing integer value '" << val << "': " << interr;
8719 return -EINVAL;
8720 }
8721 if (n > 100 || n < 0) {
8722 ss << "value out of range,valid range is 0 - 100";
8723 return -EINVAL;
8724 }
8725 p.hit_set_grade_decay_rate = n;
8726 } else if (var == "hit_set_search_last_n") {
8727 if (interr.length()) {
8728 ss << "error parsing integer value '" << val << "': " << interr;
8729 return -EINVAL;
8730 }
8731 if (n > p.hit_set_count || n < 0) {
8732 ss << "value out of range,valid range is 0 - hit_set_count";
8733 return -EINVAL;
8734 }
8735 p.hit_set_search_last_n = n;
8736 } else if (var == "min_write_recency_for_promote") {
8737 if (interr.length()) {
8738 ss << "error parsing integer value '" << val << "': " << interr;
8739 return -EINVAL;
8740 }
8741 p.min_write_recency_for_promote = n;
8742 } else if (var == "fast_read") {
8743 if (p.is_replicated()) {
8744 ss << "fast read is not supported in replication pool";
8745 return -EINVAL;
8746 }
8747 if (val == "true" || (interr.empty() && n == 1)) {
8748 p.fast_read = true;
8749 } else if (val == "false" || (interr.empty() && n == 0)) {
8750 p.fast_read = false;
8751 } else {
8752 ss << "expecting value 'true', 'false', '0', or '1'";
8753 return -EINVAL;
8754 }
8755 } else if (pool_opts_t::is_opt_name(var)) {
8756 bool unset = val == "unset";
8757 if (var == "compression_mode") {
8758 if (!unset) {
8759 auto cmode = Compressor::get_comp_mode_type(val);
8760 if (!cmode) {
8761 ss << "unrecognized compression mode '" << val << "'";
8762 return -EINVAL;
8763 }
8764 }
8765 } else if (var == "compression_algorithm") {
8766 if (!unset) {
8767 auto alg = Compressor::get_comp_alg_type(val);
8768 if (!alg) {
8769 ss << "unrecognized compression_algorithm '" << val << "'";
8770 return -EINVAL;
8771 }
8772 }
8773 } else if (var == "compression_required_ratio") {
8774 if (floaterr.length()) {
8775 ss << "error parsing float value '" << val << "': " << floaterr;
8776 return -EINVAL;
8777 }
8778 if (f < 0 || f > 1) {
8779 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8780 return -EINVAL;
8781 }
8782 } else if (var == "csum_type") {
8783 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8784 if (t < 0 ) {
8785 ss << "unrecognized csum_type '" << val << "'";
8786 return -EINVAL;
8787 }
8788 //preserve csum_type numeric value
8789 n = t;
8790 interr.clear();
8791 } else if (var == "compression_max_blob_size" ||
8792 var == "compression_min_blob_size" ||
8793 var == "csum_max_block" ||
8794 var == "csum_min_block") {
8795 if (interr.length()) {
8796 ss << "error parsing int value '" << val << "': " << interr;
8797 return -EINVAL;
8798 }
8799 } else if (var == "fingerprint_algorithm") {
8800 if (!unset) {
8801 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8802 if (!alg) {
8803 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8804 return -EINVAL;
8805 }
8806 }
8807 } else if (var == "target_size_bytes") {
8808 if (interr.length()) {
8809 ss << "error parsing unit value '" << val << "': " << interr;
8810 return -EINVAL;
8811 }
8812 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8813 ss << "must set require_osd_release to nautilus or "
8814 << "later before setting target_size_bytes";
8815 return -EINVAL;
8816 }
8817 } else if (var == "pg_num_min") {
8818 if (interr.length()) {
8819 ss << "error parsing int value '" << val << "': " << interr;
8820 return -EINVAL;
8821 }
8822 if (n > (int)p.get_pg_num_target()) {
8823 ss << "specified pg_num_min " << n
8824 << " > pg_num " << p.get_pg_num_target();
8825 return -EINVAL;
8826 }
8827 } else if (var == "pg_num_max") {
8828 if (interr.length()) {
8829 ss << "error parsing int value '" << val << "': " << interr;
8830 return -EINVAL;
8831 }
8832 if (n && n < (int)p.get_pg_num_target()) {
8833 ss << "specified pg_num_max " << n
8834 << " < pg_num " << p.get_pg_num_target();
8835 return -EINVAL;
8836 }
8837 } else if (var == "recovery_priority") {
8838 if (interr.length()) {
8839 ss << "error parsing int value '" << val << "': " << interr;
8840 return -EINVAL;
8841 }
8842 if (!g_conf()->debug_allow_any_pool_priority) {
8843 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8844 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8845 << " and " << OSD_POOL_PRIORITY_MAX;
8846 return -EINVAL;
8847 }
8848 }
8849 } else if (var == "pg_autoscale_bias") {
8850 if (f < 0.0 || f > 1000.0) {
8851 ss << "pg_autoscale_bias must be between 0 and 1000";
8852 return -EINVAL;
8853 }
8854 } else if (var == "dedup_tier") {
8855 if (interr.empty()) {
8856 ss << "expecting value 'pool name'";
8857 return -EINVAL;
8858 }
8859 // Current base tier in dedup does not support ec pool
8860 if (p.is_erasure()) {
8861 ss << "pool '" << poolstr
8862 << "' is an ec pool, which cannot be a base tier";
8863 return -ENOTSUP;
8864 }
8865 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8866 if (lowtierpool_id < 0) {
8867 ss << "unrecognized pool '" << val << "'";
8868 return -ENOENT;
8869 }
8870 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8871 ceph_assert(tp);
8872 n = lowtierpool_id;
8873 // The original input is string (pool name), but we convert it to int64_t.
8874 // So, clear interr
8875 interr.clear();
8876 } else if (var == "dedup_chunk_algorithm") {
8877 if (!unset) {
8878 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8879 if (!alg) {
8880 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8881 return -EINVAL;
8882 }
8883 }
8884 } else if (var == "dedup_cdc_chunk_size") {
8885 if (interr.length()) {
8886 ss << "error parsing int value '" << val << "': " << interr;
8887 return -EINVAL;
8888 }
8889 }
8890
8891 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8892 switch (desc.type) {
8893 case pool_opts_t::STR:
8894 if (unset) {
8895 p.opts.unset(desc.key);
8896 } else {
8897 p.opts.set(desc.key, static_cast<std::string>(val));
8898 }
8899 break;
8900 case pool_opts_t::INT:
8901 if (interr.length()) {
8902 ss << "error parsing integer value '" << val << "': " << interr;
8903 return -EINVAL;
8904 }
8905 if (n == 0) {
8906 p.opts.unset(desc.key);
8907 } else {
8908 p.opts.set(desc.key, static_cast<int64_t>(n));
8909 }
8910 break;
8911 case pool_opts_t::DOUBLE:
8912 if (floaterr.length()) {
8913 ss << "error parsing floating point value '" << val << "': " << floaterr;
8914 return -EINVAL;
8915 }
8916 if (f == 0) {
8917 p.opts.unset(desc.key);
8918 } else {
8919 p.opts.set(desc.key, static_cast<double>(f));
8920 }
8921 break;
8922 default:
8923 ceph_assert(!"unknown type");
8924 }
8925 } else {
8926 ss << "unrecognized variable '" << var << "'";
8927 return -EINVAL;
8928 }
8929 if (val != "unset") {
8930 ss << "set pool " << pool << " " << var << " to " << val;
8931 } else {
8932 ss << "unset pool " << pool << " " << var;
8933 }
8934 p.last_change = pending_inc.epoch;
8935 pending_inc.new_pools[pool] = p;
8936 return 0;
8937 }
8938
8939 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8940 const cmdmap_t& cmdmap,
8941 stringstream& ss)
8942 {
8943 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8944 }
8945
8946 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8947 const cmdmap_t& cmdmap,
8948 stringstream& ss,
8949 bool *modified)
8950 {
8951 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8952 }
8953
8954
8955 /**
8956 * Common logic for preprocess and prepare phases of pool application
8957 * tag commands. In preprocess mode we're only detecting invalid
8958 * commands, and determining whether it was a modification or a no-op.
8959 * In prepare mode we're actually updating the pending state.
8960 */
8961 int OSDMonitor::_command_pool_application(const string &prefix,
8962 const cmdmap_t& cmdmap,
8963 stringstream& ss,
8964 bool *modified,
8965 bool preparing)
8966 {
8967 string pool_name;
8968 cmd_getval(cmdmap, "pool", pool_name);
8969 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
8970 if (pool < 0) {
8971 ss << "unrecognized pool '" << pool_name << "'";
8972 return -ENOENT;
8973 }
8974
8975 pg_pool_t p = *osdmap.get_pg_pool(pool);
8976 if (preparing) {
8977 if (pending_inc.new_pools.count(pool)) {
8978 p = pending_inc.new_pools[pool];
8979 }
8980 }
8981
8982 string app;
8983 cmd_getval(cmdmap, "app", app);
8984 bool app_exists = (p.application_metadata.count(app) > 0);
8985
8986 string key;
8987 cmd_getval(cmdmap, "key", key);
8988 if (key == "all") {
8989 ss << "key cannot be 'all'";
8990 return -EINVAL;
8991 }
8992
8993 string value;
8994 cmd_getval(cmdmap, "value", value);
8995 if (value == "all") {
8996 ss << "value cannot be 'all'";
8997 return -EINVAL;
8998 }
8999
9000 if (boost::algorithm::ends_with(prefix, "enable")) {
9001 if (app.empty()) {
9002 ss << "application name must be provided";
9003 return -EINVAL;
9004 }
9005
9006 if (p.is_tier()) {
9007 ss << "application must be enabled on base tier";
9008 return -EINVAL;
9009 }
9010
9011 bool force = false;
9012 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9013
9014 if (!app_exists && !p.application_metadata.empty() && !force) {
9015 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9016 << "application; pass --yes-i-really-mean-it to proceed anyway";
9017 return -EPERM;
9018 }
9019
9020 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9021 ss << "too many enabled applications on pool '" << pool_name << "'; "
9022 << "max " << MAX_POOL_APPLICATIONS;
9023 return -EINVAL;
9024 }
9025
9026 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9027 ss << "application name '" << app << "' too long; max length "
9028 << MAX_POOL_APPLICATION_LENGTH;
9029 return -EINVAL;
9030 }
9031
9032 if (!app_exists) {
9033 p.application_metadata[app] = {};
9034 }
9035 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9036
9037 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9038 bool force = false;
9039 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9040
9041 if (!force) {
9042 ss << "Are you SURE? Disabling an application within a pool might result "
9043 << "in loss of application functionality; pass "
9044 << "--yes-i-really-mean-it to proceed anyway";
9045 return -EPERM;
9046 }
9047
9048 if (!app_exists) {
9049 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9050 << "'";
9051 return 0; // idempotent
9052 }
9053
9054 p.application_metadata.erase(app);
9055 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9056
9057 } else if (boost::algorithm::ends_with(prefix, "set")) {
9058 if (p.is_tier()) {
9059 ss << "application metadata must be set on base tier";
9060 return -EINVAL;
9061 }
9062
9063 if (!app_exists) {
9064 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9065 << "'";
9066 return -ENOENT;
9067 }
9068
9069 string key;
9070 cmd_getval(cmdmap, "key", key);
9071
9072 if (key.empty()) {
9073 ss << "key must be provided";
9074 return -EINVAL;
9075 }
9076
9077 auto &app_keys = p.application_metadata[app];
9078 if (app_keys.count(key) == 0 &&
9079 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9080 ss << "too many keys set for application '" << app << "' on pool '"
9081 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9082 return -EINVAL;
9083 }
9084
9085 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9086 ss << "key '" << app << "' too long; max length "
9087 << MAX_POOL_APPLICATION_LENGTH;
9088 return -EINVAL;
9089 }
9090
9091 string value;
9092 cmd_getval(cmdmap, "value", value);
9093 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9094 ss << "value '" << value << "' too long; max length "
9095 << MAX_POOL_APPLICATION_LENGTH;
9096 return -EINVAL;
9097 }
9098
9099 p.application_metadata[app][key] = value;
9100 ss << "set application '" << app << "' key '" << key << "' to '"
9101 << value << "' on pool '" << pool_name << "'";
9102 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9103 if (!app_exists) {
9104 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9105 << "'";
9106 return -ENOENT;
9107 }
9108
9109 string key;
9110 cmd_getval(cmdmap, "key", key);
9111 auto it = p.application_metadata[app].find(key);
9112 if (it == p.application_metadata[app].end()) {
9113 ss << "application '" << app << "' on pool '" << pool_name
9114 << "' does not have key '" << key << "'";
9115 return 0; // idempotent
9116 }
9117
9118 p.application_metadata[app].erase(it);
9119 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9120 << pool_name << "'";
9121 } else {
9122 ceph_abort();
9123 }
9124
9125 if (preparing) {
9126 p.last_change = pending_inc.epoch;
9127 pending_inc.new_pools[pool] = p;
9128 }
9129
9130 // Because we fell through this far, we didn't hit no-op cases,
9131 // so pool was definitely modified
9132 if (modified != nullptr) {
9133 *modified = true;
9134 }
9135
9136 return 0;
9137 }
9138
9139 int OSDMonitor::_prepare_command_osd_crush_remove(
9140 CrushWrapper &newcrush,
9141 int32_t id,
9142 int32_t ancestor,
9143 bool has_ancestor,
9144 bool unlink_only)
9145 {
9146 int err = 0;
9147
9148 if (has_ancestor) {
9149 err = newcrush.remove_item_under(cct, id, ancestor,
9150 unlink_only);
9151 } else {
9152 err = newcrush.remove_item(cct, id, unlink_only);
9153 }
9154 return err;
9155 }
9156
9157 void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
9158 {
9159 pending_inc.crush.clear();
9160 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
9161 }
9162
9163 int OSDMonitor::prepare_command_osd_crush_remove(
9164 CrushWrapper &newcrush,
9165 int32_t id,
9166 int32_t ancestor,
9167 bool has_ancestor,
9168 bool unlink_only)
9169 {
9170 int err = _prepare_command_osd_crush_remove(
9171 newcrush, id, ancestor,
9172 has_ancestor, unlink_only);
9173
9174 if (err < 0)
9175 return err;
9176
9177 ceph_assert(err == 0);
9178 do_osd_crush_remove(newcrush);
9179
9180 return 0;
9181 }
9182
9183 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9184 {
9185 if (osdmap.is_up(id)) {
9186 return -EBUSY;
9187 }
9188
9189 pending_inc.new_state[id] = osdmap.get_state(id);
9190 pending_inc.new_uuid[id] = uuid_d();
9191 pending_metadata_rm.insert(id);
9192 pending_metadata.erase(id);
9193
9194 return 0;
9195 }
9196
9197 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9198 {
9199 ceph_assert(existing_id);
9200 *existing_id = -1;
9201
9202 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9203 if (!osdmap.exists(i) &&
9204 pending_inc.new_up_client.count(i) == 0 &&
9205 (pending_inc.new_state.count(i) == 0 ||
9206 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9207 *existing_id = i;
9208 return -1;
9209 }
9210 }
9211
9212 if (pending_inc.new_max_osd < 0) {
9213 return osdmap.get_max_osd();
9214 }
9215 return pending_inc.new_max_osd;
9216 }
9217
/**
 * Register a new (or matching existing) osd in the pending map and
 * optionally assign its crush device class.
 *
 * Validation is presumed to have been performed by the caller (see
 * validate_osd_create()); invariant violations assert.
 *
 * @param id            desired osd id, or -1 to pick/allocate one
 * @param uuid          osd uuid; may be zero (legacy `osd create`)
 * @param device_class  crush device class to assign, or empty for none
 * @param new_id        out: the id actually used (must be non-null)
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // uuid already maps to an osd; reuse that id (it must match any
      // explicitly requested id)
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
	   << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // a free id below max_osd was found; reuse it
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // stage a crush update assigning the requested device class
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
	   << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
	   << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
	       << dendl;
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // grow max_osd if neither the committed map nor the pending update
  // already covers the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9307
/**
 * Validate whether a new osd can be created with the given id/uuid.
 *
 * Note the mixed return convention:
 * @return 0        creation may proceed (or there was nothing to validate);
 *         EEXIST   (positive!) osd already exists and matches -- an
 *                  idempotent case; *existing_id is set to its id;
 *         -EEXIST  uuid already in use by a different id;
 *         -EINVAL  id already in use, with a different uuid;
 *         -EAGAIN  a matching osd is pending in the current proposal.
 */
int OSDMonitor::validate_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const bool check_osd_exists,
    int32_t* existing_id,
    stringstream& ss)
{

  dout(10) << __func__ << " id " << id << " uuid " << uuid
	   << " check_osd_exists " << check_osd_exists << dendl;

  ceph_assert(existing_id);

  if (id < 0 && uuid.is_zero()) {
    // we have nothing to validate
    *existing_id = -1;
    return 0;
  } else if (uuid.is_zero()) {
    // we have an id but we will ignore it - because that's what
    // `osd create` does.
    return 0;
  }

  /*
   * This function will be used to validate whether we are able to
   * create a new osd when the `uuid` is specified.
   *
   * It will be used by both `osd create` and `osd new`, as the checks
   * are basically the same when it pertains to osd id and uuid validation.
   * However, `osd create` presumes an `uuid` is optional, for legacy
   * reasons, while `osd new` requires the `uuid` to be provided. This
   * means that `osd create` will not be idempotent if an `uuid` is not
   * provided, but we will always guarantee the idempotency of `osd new`.
   */

  ceph_assert(!uuid.is_zero());
  if (pending_inc.identify_osd(uuid) >= 0) {
    // osd is about to exist
    return -EAGAIN;
  }

  int32_t i = osdmap.identify_osd(uuid);
  if (i >= 0) {
    // osd already exists
    if (id >= 0 && i != id) {
      ss << "uuid " << uuid << " already in use for different id " << i;
      return -EEXIST;
    }
    // return a positive errno to distinguish between a blocking error
    // and an error we consider to not be a problem (i.e., this would be
    // an idempotent operation).
    *existing_id = i;
    return EEXIST;
  }
  // i < 0
  if (id >= 0) {
    if (pending_inc.new_state.count(id)) {
      // osd is about to exist
      return -EAGAIN;
    }
    // we may not care if an osd exists if we are recreating a previously
    // destroyed osd.
    if (check_osd_exists && osdmap.exists(id)) {
      ss << "id " << id << " already in use and does not match uuid "
	 << uuid;
      return -EINVAL;
    }
  }
  return 0;
}
9378
9379 int OSDMonitor::prepare_command_osd_create(
9380 const int32_t id,
9381 const uuid_d& uuid,
9382 int32_t* existing_id,
9383 stringstream& ss)
9384 {
9385 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9386 ceph_assert(existing_id);
9387 if (osdmap.is_destroyed(id)) {
9388 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9389 "instead.";
9390 return -EINVAL;
9391 }
9392
9393 if (uuid.is_zero()) {
9394 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9395 }
9396
9397 return validate_osd_create(id, uuid, true, existing_id, ss);
9398 }
9399
/**
 * Handle `osd new`: create a brand new osd, or recreate a previously
 * destroyed one, optionally registering cephx/lockbox secrets and a
 * dm-crypt key supplied via @p params (e.g. from `-i secrets.json`).
 *
 * Requires paxos to be plugged by the caller, since the auth and
 * config-key (kv) services may also be updated here.
 *
 * @param op      originating monitor command request
 * @param cmdmap  parsed command args: `uuid` required, `id` optional
 * @param params  optional secrets / crush_device_class key-value pairs
 * @param ss      plain-text output (the osd id on success)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST for an idempotent no-op,
 *         negative errno on error
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
				&existing_id, ss);

  // note: validate_osd_create() returns *positive* EEXIST when the
  // uuid (and id, if given) already match an existing osd.
  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // or find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
	ceph_assert(existing_id >= 0);
	id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // a cephx secret is mandatory whenever any secret is supplied
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
	     << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      // exactly one of the two was supplied; they must come together
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
					  cephx_secret,
					  lockbox_secret,
					  cephx_entity,
					  lockbox_entity,
					  ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
	return err;
      } else if (may_be_idempotent && err != EEXIST) {
	ceph_assert(id >= 0);
	ss << "osd." << id << " exists but dm-crypt key does not match.";
	return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
		(!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
				    lockbox_entity,
				    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    // new_state bits are XOR-ed into the map on apply, so recording
    // DESTROYED here clears the destroyed flag
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9663
9664 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9665 {
9666 op->mark_osdmon_event(__func__);
9667 auto m = op->get_req<MMonCommand>();
9668 stringstream ss;
9669 cmdmap_t cmdmap;
9670 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9671 string rs = ss.str();
9672 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9673 return true;
9674 }
9675
9676 MonSession *session = op->get_session();
9677 if (!session) {
9678 derr << __func__ << " no session" << dendl;
9679 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9680 return true;
9681 }
9682
9683 return prepare_command_impl(op, cmdmap);
9684 }
9685
9686 static int parse_reweights(CephContext *cct,
9687 const cmdmap_t& cmdmap,
9688 const OSDMap& osdmap,
9689 map<int32_t, uint32_t>* weights)
9690 {
9691 string weights_str;
9692 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9693 return -EINVAL;
9694 }
9695 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9696 json_spirit::mValue json_value;
9697 if (!json_spirit::read(weights_str, json_value)) {
9698 return -EINVAL;
9699 }
9700 if (json_value.type() != json_spirit::obj_type) {
9701 return -EINVAL;
9702 }
9703 const auto obj = json_value.get_obj();
9704 try {
9705 for (auto& osd_weight : obj) {
9706 auto osd_id = std::stoi(osd_weight.first);
9707 if (!osdmap.exists(osd_id)) {
9708 return -ENOENT;
9709 }
9710 if (osd_weight.second.type() != json_spirit::str_type) {
9711 return -EINVAL;
9712 }
9713 auto weight = std::stoul(osd_weight.second.get_str());
9714 weights->insert({osd_id, weight});
9715 }
9716 } catch (const std::logic_error& e) {
9717 return -EINVAL;
9718 }
9719 return 0;
9720 }
9721
/**
 * Mark osd `id` as destroyed: remove its auth entities and config-key
 * (lockbox) data and flag it DESTROYED in the pending map, while keeping
 * the id allocated so it can later be reused by `osd new`.
 *
 * Requires paxos to be plugged; proposing the pending changes is left to
 * the caller (see note at the end).
 *
 * @return 0 on success (idempotent if already destroyed),
 *         -ENOENT if the osd does not exist,
 *         other negative errno from auth validation.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
	   << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon.authmon()->validate_osd_destroy(id, uuid,
						cephx_entity,
						lockbox_entity,
						ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities are already gone; nothing to remove there
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // the kv service only reports -ENOENT here: keys already gone
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9793
/**
 * Fully purge osd `id`: remove it from crush, destroy its auth/config-key
 * data, and delete it from the osd map.  The osd must not be up; paxos
 * must be plugged, and the caller is responsible for proposing.
 *
 * @return 0 on success, -ENOENT if the osd never existed (idempotent),
 *         other negative errno on failure before any update was made.
 */
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   *  1. we make sure that removing from crush works
   *  2. we call `prepare_command_osd_destroy()`. If it returns an
   *     error, then we abort the whole operation, as no updates
   *     have been made. However, this function will have
   *     side-effects, thus we need to make sure that all operations
   *     performed henceforth will *always* succeed.
   *  3. we call `prepare_command_osd_remove()`. Although this
   *     function can return an error, it currently only checks if the
   *     osd is up - and we have made sure that it is not so, so there
   *     is no conflict, and it is effectively an update.
   *  4. finally, we call `do_osd_crush_remove()`, which will perform
   *     the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  bool may_be_idempotent = false;

  // step 1: dry-run the crush removal on a local copy of the map
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been
  // marked destroyed
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
	err = 0;
      } else {
	return err;
      }
    } else {
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
	     << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: queue removal from the osd map
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: stage the crush update we validated in step 1
  do_osd_crush_remove(newcrush);
  return 0;
}
9861
9862 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9863 const cmdmap_t& cmdmap)
9864 {
9865 op->mark_osdmon_event(__func__);
9866 auto m = op->get_req<MMonCommand>();
9867 bool ret = false;
9868 stringstream ss;
9869 string rs;
9870 bufferlist rdata;
9871 int err = 0;
9872
9873 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
9874 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9875
9876 string prefix;
9877 cmd_getval(cmdmap, "prefix", prefix);
9878
9879 int64_t osdid;
9880 string osd_name;
9881 bool osdid_present = false;
9882 if (prefix != "osd pg-temp" &&
9883 prefix != "osd pg-upmap" &&
9884 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9885 osdid_present = cmd_getval(cmdmap, "id", osdid);
9886 }
9887 if (osdid_present) {
9888 ostringstream oss;
9889 oss << "osd." << osdid;
9890 osd_name = oss.str();
9891 }
9892
9893 // Even if there's a pending state with changes that could affect
9894 // a command, considering that said state isn't yet committed, we
9895 // just don't care about those changes if the command currently being
9896 // handled acts as a no-op against the current committed state.
9897 // In a nutshell, we assume this command happens *before*.
9898 //
9899 // Let me make this clearer:
9900 //
9901 // - If we have only one client, and that client issues some
9902 // operation that would conflict with this operation but is
9903 // still on the pending state, then we would be sure that said
9904 // operation wouldn't have returned yet, so the client wouldn't
9905 // issue this operation (unless the client didn't wait for the
9906 // operation to finish, and that would be the client's own fault).
9907 //
9908 // - If we have more than one client, each client will observe
9909 // whatever is the state at the moment of the commit. So, if we
9910 // have two clients, one issuing an unlink and another issuing a
9911 // link, and if the link happens while the unlink is still on the
9912 // pending state, from the link's point-of-view this is a no-op.
9913 // If different clients are issuing conflicting operations and
9914 // they care about that, then the clients should make sure they
9915 // enforce some kind of concurrency mechanism -- from our
9916 // perspective that's what Douglas Adams would call an SEP.
9917 //
9918 // This should be used as a general guideline for most commands handled
9919 // in this function. Adapt as you see fit, but please bear in mind that
9920 // this is the expected behavior.
9921
9922
9923 if (prefix == "osd setcrushmap" ||
9924 (prefix == "osd crush set" && !osdid_present)) {
9925 if (pending_inc.crush.length()) {
9926 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9927 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9928 return true;
9929 }
9930 dout(10) << "prepare_command setting new crush map" << dendl;
9931 bufferlist data(m->get_data());
9932 CrushWrapper crush;
9933 try {
9934 auto bl = data.cbegin();
9935 crush.decode(bl);
9936 }
9937 catch (const std::exception &e) {
9938 err = -EINVAL;
9939 ss << "Failed to parse crushmap: " << e.what();
9940 goto reply;
9941 }
9942
9943 int64_t prior_version = 0;
9944 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9945 if (prior_version == osdmap.get_crush_version() - 1) {
9946 // see if we are a resend of the last update. this is imperfect
9947 // (multiple racing updaters may not both get reliable success)
9948 // but we expect crush updaters (via this interface) to be rare-ish.
9949 bufferlist current, proposed;
9950 osdmap.crush->encode(current, mon.get_quorum_con_features());
9951 crush.encode(proposed, mon.get_quorum_con_features());
9952 if (current.contents_equal(proposed)) {
9953 dout(10) << __func__
9954 << " proposed matches current and version equals previous"
9955 << dendl;
9956 err = 0;
9957 ss << osdmap.get_crush_version();
9958 goto reply;
9959 }
9960 }
9961 if (prior_version != osdmap.get_crush_version()) {
9962 err = -EPERM;
9963 ss << "prior_version " << prior_version << " != crush version "
9964 << osdmap.get_crush_version();
9965 goto reply;
9966 }
9967 }
9968
9969 if (!validate_crush_against_features(&crush, ss)) {
9970 err = -EINVAL;
9971 goto reply;
9972 }
9973
9974 err = osdmap.validate_crush_rules(&crush, &ss);
9975 if (err < 0) {
9976 goto reply;
9977 }
9978
9979 if (g_conf()->mon_osd_crush_smoke_test) {
9980 // sanity check: test some inputs to make sure this map isn't
9981 // totally broken
9982 dout(10) << " testing map" << dendl;
9983 stringstream ess;
9984 CrushTester tester(crush, ess);
9985 tester.set_min_x(0);
9986 tester.set_max_x(50);
9987 tester.set_num_rep(3); // arbitrary
9988 auto start = ceph::coarse_mono_clock::now();
9989 int r = tester.test_with_fork(g_conf()->mon_lease);
9990 auto duration = ceph::coarse_mono_clock::now() - start;
9991 if (r < 0) {
9992 dout(10) << " tester.test_with_fork returns " << r
9993 << ": " << ess.str() << dendl;
9994 ss << "crush smoke test failed with " << r << ": " << ess.str();
9995 err = r;
9996 goto reply;
9997 }
9998 dout(10) << __func__ << " crush somke test duration: "
9999 << duration << ", result: " << ess.str() << dendl;
10000 }
10001
10002 pending_inc.crush = data;
10003 ss << osdmap.get_crush_version() + 1;
10004 goto update;
10005
10006 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10007 CrushWrapper newcrush = _get_pending_crush();
10008 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10009 int bid = -1 - b;
10010 if (newcrush.bucket_exists(bid) &&
10011 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10012 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10013 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10014 }
10015 }
10016 if (!validate_crush_against_features(&newcrush, ss)) {
10017 err = -EINVAL;
10018 goto reply;
10019 }
10020 pending_inc.crush.clear();
10021 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10022 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10023 get_last_committed() + 1));
10024 return true;
10025 } else if (prefix == "osd crush set-device-class") {
10026 string device_class;
10027 if (!cmd_getval(cmdmap, "class", device_class)) {
10028 err = -EINVAL; // no value!
10029 goto reply;
10030 }
10031
10032 bool stop = false;
10033 vector<string> idvec;
10034 cmd_getval(cmdmap, "ids", idvec);
10035 CrushWrapper newcrush = _get_pending_crush();
10036 set<int> updated;
10037 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10038 set<int> osds;
10039 // wildcard?
10040 if (j == 0 &&
10041 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10042 osdmap.get_all_osds(osds);
10043 stop = true;
10044 } else {
10045 // try traditional single osd way
10046 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10047 if (osd < 0) {
10048 // ss has reason for failure
10049 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10050 err = -EINVAL;
10051 continue;
10052 }
10053 osds.insert(osd);
10054 }
10055
10056 for (auto &osd : osds) {
10057 if (!osdmap.exists(osd)) {
10058 ss << "osd." << osd << " does not exist. ";
10059 continue;
10060 }
10061
10062 ostringstream oss;
10063 oss << "osd." << osd;
10064 string name = oss.str();
10065
10066 if (newcrush.get_max_devices() < osd + 1) {
10067 newcrush.set_max_devices(osd + 1);
10068 }
10069 string action;
10070 if (newcrush.item_exists(osd)) {
10071 action = "updating";
10072 } else {
10073 action = "creating";
10074 newcrush.set_item_name(osd, name);
10075 }
10076
10077 dout(5) << action << " crush item id " << osd << " name '" << name
10078 << "' device_class '" << device_class << "'"
10079 << dendl;
10080 err = newcrush.update_device_class(osd, device_class, name, &ss);
10081 if (err < 0) {
10082 goto reply;
10083 }
10084 if (err == 0 && !_have_pending_crush()) {
10085 if (!stop) {
10086 // for single osd only, wildcard makes too much noise
10087 ss << "set-device-class item id " << osd << " name '" << name
10088 << "' device_class '" << device_class << "': no change. ";
10089 }
10090 } else {
10091 updated.insert(osd);
10092 }
10093 }
10094 }
10095
10096 pending_inc.crush.clear();
10097 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10098 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10099 getline(ss, rs);
10100 wait_for_finished_proposal(
10101 op,
10102 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10103 return true;
10104 } else if (prefix == "osd crush rm-device-class") {
10105 bool stop = false;
10106 vector<string> idvec;
10107 cmd_getval(cmdmap, "ids", idvec);
10108 CrushWrapper newcrush = _get_pending_crush();
10109 set<int> updated;
10110
10111 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10112 set<int> osds;
10113
10114 // wildcard?
10115 if (j == 0 &&
10116 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10117 osdmap.get_all_osds(osds);
10118 stop = true;
10119 } else {
10120 // try traditional single osd way
10121 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10122 if (osd < 0) {
10123 // ss has reason for failure
10124 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10125 err = -EINVAL;
10126 goto reply;
10127 }
10128 osds.insert(osd);
10129 }
10130
10131 for (auto &osd : osds) {
10132 if (!osdmap.exists(osd)) {
10133 ss << "osd." << osd << " does not exist. ";
10134 continue;
10135 }
10136
10137 auto class_name = newcrush.get_item_class(osd);
10138 if (!class_name) {
10139 ss << "osd." << osd << " belongs to no class, ";
10140 continue;
10141 }
10142 // note that we do not verify if class_is_in_use here
10143 // in case the device is misclassified and user wants
10144 // to overridely reset...
10145
10146 err = newcrush.remove_device_class(cct, osd, &ss);
10147 if (err < 0) {
10148 // ss has reason for failure
10149 goto reply;
10150 }
10151 updated.insert(osd);
10152 }
10153 }
10154
10155 pending_inc.crush.clear();
10156 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10157 ss << "done removing class of osd(s): " << updated;
10158 getline(ss, rs);
10159 wait_for_finished_proposal(
10160 op,
10161 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10162 return true;
10163 } else if (prefix == "osd crush class create") {
10164 string device_class;
10165 if (!cmd_getval(cmdmap, "class", device_class)) {
10166 err = -EINVAL; // no value!
10167 goto reply;
10168 }
10169 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10170 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10171 << "luminous' before using crush device classes";
10172 err = -EPERM;
10173 goto reply;
10174 }
10175 if (!_have_pending_crush() &&
10176 _get_stable_crush().class_exists(device_class)) {
10177 ss << "class '" << device_class << "' already exists";
10178 goto reply;
10179 }
10180 CrushWrapper newcrush = _get_pending_crush();
10181 if (newcrush.class_exists(device_class)) {
10182 ss << "class '" << device_class << "' already exists";
10183 goto update;
10184 }
10185 int class_id = newcrush.get_or_create_class_id(device_class);
10186 pending_inc.crush.clear();
10187 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10188 ss << "created class " << device_class << " with id " << class_id
10189 << " to crush map";
10190 goto update;
10191 } else if (prefix == "osd crush class rm") {
10192 string device_class;
10193 if (!cmd_getval(cmdmap, "class", device_class)) {
10194 err = -EINVAL; // no value!
10195 goto reply;
10196 }
10197 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10198 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10199 << "luminous' before using crush device classes";
10200 err = -EPERM;
10201 goto reply;
10202 }
10203
10204 if (!osdmap.crush->class_exists(device_class)) {
10205 err = 0;
10206 goto reply;
10207 }
10208
10209 CrushWrapper newcrush = _get_pending_crush();
10210 if (!newcrush.class_exists(device_class)) {
10211 err = 0; // make command idempotent
10212 goto wait;
10213 }
10214 int class_id = newcrush.get_class_id(device_class);
10215 stringstream ts;
10216 if (newcrush.class_is_in_use(class_id, &ts)) {
10217 err = -EBUSY;
10218 ss << "class '" << device_class << "' " << ts.str();
10219 goto reply;
10220 }
10221
10222 // check if class is used by any erasure-code-profiles
10223 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10224 osdmap.get_erasure_code_profiles();
10225 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10226 #ifdef HAVE_STDLIB_MAP_SPLICING
10227 ec_profiles.merge(old_ec_profiles);
10228 #else
10229 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10230 make_move_iterator(end(old_ec_profiles)));
10231 #endif
10232 list<string> referenced_by;
10233 for (auto &i: ec_profiles) {
10234 for (auto &j: i.second) {
10235 if ("crush-device-class" == j.first && device_class == j.second) {
10236 referenced_by.push_back(i.first);
10237 }
10238 }
10239 }
10240 if (!referenced_by.empty()) {
10241 err = -EBUSY;
10242 ss << "class '" << device_class
10243 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10244 goto reply;
10245 }
10246
10247 set<int> osds;
10248 newcrush.get_devices_by_class(device_class, &osds);
10249 for (auto& p: osds) {
10250 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
10251 if (err < 0) {
10252 // ss has reason for failure
10253 goto reply;
10254 }
10255 }
10256
10257 if (osds.empty()) {
10258 // empty class, remove directly
10259 err = newcrush.remove_class_name(device_class);
10260 if (err < 0) {
10261 ss << "class '" << device_class << "' cannot be removed '"
10262 << cpp_strerror(err) << "'";
10263 goto reply;
10264 }
10265 }
10266
10267 pending_inc.crush.clear();
10268 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10269 ss << "removed class " << device_class << " with id " << class_id
10270 << " from crush map";
10271 goto update;
10272 } else if (prefix == "osd crush class rename") {
10273 string srcname, dstname;
10274 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10275 err = -EINVAL;
10276 goto reply;
10277 }
10278 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10279 err = -EINVAL;
10280 goto reply;
10281 }
10282
10283 CrushWrapper newcrush = _get_pending_crush();
10284 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10285 // suppose this is a replay and return success
10286 // so command is idempotent
10287 ss << "already renamed to '" << dstname << "'";
10288 err = 0;
10289 goto reply;
10290 }
10291
10292 err = newcrush.rename_class(srcname, dstname);
10293 if (err < 0) {
10294 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10295 << cpp_strerror(err);
10296 goto reply;
10297 }
10298
10299 pending_inc.crush.clear();
10300 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10301 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10302 goto update;
10303 } else if (prefix == "osd crush add-bucket") {
10304 // os crush add-bucket <name> <type>
10305 string name, typestr;
10306 vector<string> argvec;
10307 cmd_getval(cmdmap, "name", name);
10308 cmd_getval(cmdmap, "type", typestr);
10309 cmd_getval(cmdmap, "args", argvec);
10310 map<string,string> loc;
10311 if (!argvec.empty()) {
10312 CrushWrapper::parse_loc_map(argvec, &loc);
10313 dout(0) << "will create and move bucket '" << name
10314 << "' to location " << loc << dendl;
10315 }
10316
10317 if (!_have_pending_crush() &&
10318 _get_stable_crush().name_exists(name)) {
10319 ss << "bucket '" << name << "' already exists";
10320 goto reply;
10321 }
10322
10323 CrushWrapper newcrush = _get_pending_crush();
10324
10325 if (newcrush.name_exists(name)) {
10326 ss << "bucket '" << name << "' already exists";
10327 goto update;
10328 }
10329 int type = newcrush.get_type_id(typestr);
10330 if (type < 0) {
10331 ss << "type '" << typestr << "' does not exist";
10332 err = -EINVAL;
10333 goto reply;
10334 }
10335 if (type == 0) {
10336 ss << "type '" << typestr << "' is for devices, not buckets";
10337 err = -EINVAL;
10338 goto reply;
10339 }
10340 int bucketno;
10341 err = newcrush.add_bucket(0, 0,
10342 CRUSH_HASH_DEFAULT, type, 0, NULL,
10343 NULL, &bucketno);
10344 if (err < 0) {
10345 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10346 goto reply;
10347 }
10348 err = newcrush.set_item_name(bucketno, name);
10349 if (err < 0) {
10350 ss << "error setting bucket name to '" << name << "'";
10351 goto reply;
10352 }
10353
10354 if (!loc.empty()) {
10355 if (!newcrush.check_item_loc(cct, bucketno, loc,
10356 (int *)NULL)) {
10357 err = newcrush.move_bucket(cct, bucketno, loc);
10358 if (err < 0) {
10359 ss << "error moving bucket '" << name << "' to location " << loc;
10360 goto reply;
10361 }
10362 } else {
10363 ss << "no need to move item id " << bucketno << " name '" << name
10364 << "' to location " << loc << " in crush map";
10365 }
10366 }
10367
10368 pending_inc.crush.clear();
10369 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10370 if (loc.empty()) {
10371 ss << "added bucket " << name << " type " << typestr
10372 << " to crush map";
10373 } else {
10374 ss << "added bucket " << name << " type " << typestr
10375 << " to location " << loc;
10376 }
10377 goto update;
10378 } else if (prefix == "osd crush rename-bucket") {
10379 string srcname, dstname;
10380 cmd_getval(cmdmap, "srcname", srcname);
10381 cmd_getval(cmdmap, "dstname", dstname);
10382
10383 err = crush_rename_bucket(srcname, dstname, &ss);
10384 if (err == -EALREADY) // equivalent to success for idempotency
10385 err = 0;
10386 if (err)
10387 goto reply;
10388 else
10389 goto update;
10390 } else if (prefix == "osd crush weight-set create" ||
10391 prefix == "osd crush weight-set create-compat") {
10392 if (_have_pending_crush()) {
10393 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10394 goto wait;
10395 }
10396 CrushWrapper newcrush = _get_pending_crush();
10397 int64_t pool;
10398 int positions;
10399 if (newcrush.has_non_straw2_buckets()) {
10400 ss << "crush map contains one or more bucket(s) that are not straw2";
10401 err = -EPERM;
10402 goto reply;
10403 }
10404 if (prefix == "osd crush weight-set create") {
10405 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10406 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10407 ss << "require_min_compat_client "
10408 << osdmap.require_min_compat_client
10409 << " < luminous, which is required for per-pool weight-sets. "
10410 << "Try 'ceph osd set-require-min-compat-client luminous' "
10411 << "before using the new interface";
10412 err = -EPERM;
10413 goto reply;
10414 }
10415 string poolname, mode;
10416 cmd_getval(cmdmap, "pool", poolname);
10417 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10418 if (pool < 0) {
10419 ss << "pool '" << poolname << "' not found";
10420 err = -ENOENT;
10421 goto reply;
10422 }
10423 cmd_getval(cmdmap, "mode", mode);
10424 if (mode != "flat" && mode != "positional") {
10425 ss << "unrecognized weight-set mode '" << mode << "'";
10426 err = -EINVAL;
10427 goto reply;
10428 }
10429 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10430 } else {
10431 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10432 positions = 1;
10433 }
10434 if (!newcrush.create_choose_args(pool, positions)) {
10435 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10436 ss << "compat weight-set already created";
10437 } else {
10438 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10439 << "' already created";
10440 }
10441 goto reply;
10442 }
10443 pending_inc.crush.clear();
10444 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10445 goto update;
10446
10447 } else if (prefix == "osd crush weight-set rm" ||
10448 prefix == "osd crush weight-set rm-compat") {
10449 CrushWrapper newcrush = _get_pending_crush();
10450 int64_t pool;
10451 if (prefix == "osd crush weight-set rm") {
10452 string poolname;
10453 cmd_getval(cmdmap, "pool", poolname);
10454 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10455 if (pool < 0) {
10456 ss << "pool '" << poolname << "' not found";
10457 err = -ENOENT;
10458 goto reply;
10459 }
10460 } else {
10461 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10462 }
10463 newcrush.rm_choose_args(pool);
10464 pending_inc.crush.clear();
10465 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10466 goto update;
10467
10468 } else if (prefix == "osd crush weight-set reweight" ||
10469 prefix == "osd crush weight-set reweight-compat") {
10470 string poolname, item;
10471 vector<double> weight;
10472 cmd_getval(cmdmap, "pool", poolname);
10473 cmd_getval(cmdmap, "item", item);
10474 cmd_getval(cmdmap, "weight", weight);
10475 CrushWrapper newcrush = _get_pending_crush();
10476 int64_t pool;
10477 if (prefix == "osd crush weight-set reweight") {
10478 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10479 if (pool < 0) {
10480 ss << "pool '" << poolname << "' not found";
10481 err = -ENOENT;
10482 goto reply;
10483 }
10484 if (!newcrush.have_choose_args(pool)) {
10485 ss << "no weight-set for pool '" << poolname << "'";
10486 err = -ENOENT;
10487 goto reply;
10488 }
10489 auto arg_map = newcrush.choose_args_get(pool);
10490 int positions = newcrush.get_choose_args_positions(arg_map);
10491 if (weight.size() != (size_t)positions) {
10492 ss << "must specify exact " << positions << " weight values";
10493 err = -EINVAL;
10494 goto reply;
10495 }
10496 } else {
10497 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10498 if (!newcrush.have_choose_args(pool)) {
10499 ss << "no backward-compatible weight-set";
10500 err = -ENOENT;
10501 goto reply;
10502 }
10503 }
10504 if (!newcrush.name_exists(item)) {
10505 ss << "item '" << item << "' does not exist";
10506 err = -ENOENT;
10507 goto reply;
10508 }
10509 err = newcrush.choose_args_adjust_item_weightf(
10510 cct,
10511 newcrush.choose_args_get(pool),
10512 newcrush.get_item_id(item),
10513 weight,
10514 &ss);
10515 if (err < 0) {
10516 goto reply;
10517 }
10518 err = 0;
10519 pending_inc.crush.clear();
10520 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10521 goto update;
10522 } else if (osdid_present &&
10523 (prefix == "osd crush set" || prefix == "osd crush add")) {
10524 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10525 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10526 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10527
10528 if (!osdmap.exists(osdid)) {
10529 err = -ENOENT;
10530 ss << osd_name
10531 << " does not exist. Create it before updating the crush map";
10532 goto reply;
10533 }
10534
10535 double weight;
10536 if (!cmd_getval(cmdmap, "weight", weight)) {
10537 ss << "unable to parse weight value '"
10538 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10539 err = -EINVAL;
10540 goto reply;
10541 }
10542
10543 string args;
10544 vector<string> argvec;
10545 cmd_getval(cmdmap, "args", argvec);
10546 map<string,string> loc;
10547 CrushWrapper::parse_loc_map(argvec, &loc);
10548
10549 if (prefix == "osd crush set"
10550 && !_get_stable_crush().item_exists(osdid)) {
10551 err = -ENOENT;
10552 ss << "unable to set item id " << osdid << " name '" << osd_name
10553 << "' weight " << weight << " at location " << loc
10554 << ": does not exist";
10555 goto reply;
10556 }
10557
10558 dout(5) << "adding/updating crush item id " << osdid << " name '"
10559 << osd_name << "' weight " << weight << " at location "
10560 << loc << dendl;
10561 CrushWrapper newcrush = _get_pending_crush();
10562
10563 string action;
10564 if (prefix == "osd crush set" ||
10565 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10566 action = "set";
10567 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10568 } else {
10569 action = "add";
10570 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10571 if (err == 0)
10572 err = 1;
10573 }
10574
10575 if (err < 0)
10576 goto reply;
10577
10578 if (err == 0 && !_have_pending_crush()) {
10579 ss << action << " item id " << osdid << " name '" << osd_name
10580 << "' weight " << weight << " at location " << loc << ": no change";
10581 goto reply;
10582 }
10583
10584 pending_inc.crush.clear();
10585 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10586 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10587 << weight << " at location " << loc << " to crush map";
10588 getline(ss, rs);
10589 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10590 get_last_committed() + 1));
10591 return true;
10592
10593 } else if (prefix == "osd crush create-or-move") {
10594 do {
10595 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10596 if (!osdmap.exists(osdid)) {
10597 err = -ENOENT;
10598 ss << osd_name
10599 << " does not exist. create it before updating the crush map";
10600 goto reply;
10601 }
10602
10603 double weight;
10604 if (!cmd_getval(cmdmap, "weight", weight)) {
10605 ss << "unable to parse weight value '"
10606 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10607 err = -EINVAL;
10608 goto reply;
10609 }
10610
10611 string args;
10612 vector<string> argvec;
10613 cmd_getval(cmdmap, "args", argvec);
10614 map<string,string> loc;
10615 CrushWrapper::parse_loc_map(argvec, &loc);
10616
10617 dout(0) << "create-or-move crush item name '" << osd_name
10618 << "' initial_weight " << weight << " at location " << loc
10619 << dendl;
10620
10621 CrushWrapper newcrush = _get_pending_crush();
10622
10623 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10624 g_conf()->osd_crush_update_weight_set);
10625 if (err == 0) {
10626 ss << "create-or-move updated item name '" << osd_name
10627 << "' weight " << weight
10628 << " at location " << loc << " to crush map";
10629 break;
10630 }
10631 if (err > 0) {
10632 pending_inc.crush.clear();
10633 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10634 ss << "create-or-move updating item name '" << osd_name
10635 << "' weight " << weight
10636 << " at location " << loc << " to crush map";
10637 getline(ss, rs);
10638 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10639 get_last_committed() + 1));
10640 return true;
10641 }
10642 } while (false);
10643
10644 } else if (prefix == "osd crush move") {
10645 do {
10646 // osd crush move <name> <loc1> [<loc2> ...]
10647 string name;
10648 vector<string> argvec;
10649 cmd_getval(cmdmap, "name", name);
10650 cmd_getval(cmdmap, "args", argvec);
10651 map<string,string> loc;
10652 CrushWrapper::parse_loc_map(argvec, &loc);
10653
10654 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10655 CrushWrapper newcrush = _get_pending_crush();
10656
10657 if (!newcrush.name_exists(name)) {
10658 err = -ENOENT;
10659 ss << "item " << name << " does not exist";
10660 break;
10661 }
10662 int id = newcrush.get_item_id(name);
10663
10664 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10665 if (id >= 0) {
10666 err = newcrush.create_or_move_item(
10667 cct, id, 0, name, loc,
10668 g_conf()->osd_crush_update_weight_set);
10669 } else {
10670 err = newcrush.move_bucket(cct, id, loc);
10671 }
10672 if (err >= 0) {
10673 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10674 pending_inc.crush.clear();
10675 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10676 getline(ss, rs);
10677 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10678 get_last_committed() + 1));
10679 return true;
10680 }
10681 } else {
10682 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10683 err = 0;
10684 }
10685 } while (false);
10686 } else if (prefix == "osd crush swap-bucket") {
10687 string source, dest;
10688 cmd_getval(cmdmap, "source", source);
10689 cmd_getval(cmdmap, "dest", dest);
10690
10691 bool force = false;
10692 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10693
10694 CrushWrapper newcrush = _get_pending_crush();
10695 if (!newcrush.name_exists(source)) {
10696 ss << "source item " << source << " does not exist";
10697 err = -ENOENT;
10698 goto reply;
10699 }
10700 if (!newcrush.name_exists(dest)) {
10701 ss << "dest item " << dest << " does not exist";
10702 err = -ENOENT;
10703 goto reply;
10704 }
10705 int sid = newcrush.get_item_id(source);
10706 int did = newcrush.get_item_id(dest);
10707 int sparent;
10708 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10709 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10710 err = -EPERM;
10711 goto reply;
10712 }
10713 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10714 !force) {
10715 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10716 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10717 << "; pass --yes-i-really-mean-it to proceed anyway";
10718 err = -EPERM;
10719 goto reply;
10720 }
10721 int r = newcrush.swap_bucket(cct, sid, did);
10722 if (r < 0) {
10723 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10724 err = r;
10725 goto reply;
10726 }
10727 ss << "swapped bucket of " << source << " to " << dest;
10728 pending_inc.crush.clear();
10729 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10730 wait_for_finished_proposal(op,
10731 new Monitor::C_Command(mon, op, err, ss.str(),
10732 get_last_committed() + 1));
10733 return true;
10734 } else if (prefix == "osd crush link") {
10735 // osd crush link <name> <loc1> [<loc2> ...]
10736 string name;
10737 cmd_getval(cmdmap, "name", name);
10738 vector<string> argvec;
10739 cmd_getval(cmdmap, "args", argvec);
10740 map<string,string> loc;
10741 CrushWrapper::parse_loc_map(argvec, &loc);
10742
10743 // Need an explicit check for name_exists because get_item_id returns
10744 // 0 on unfound.
10745 int id = osdmap.crush->get_item_id(name);
10746 if (!osdmap.crush->name_exists(name)) {
10747 err = -ENOENT;
10748 ss << "item " << name << " does not exist";
10749 goto reply;
10750 } else {
10751 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10752 }
10753 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10754 ss << "no need to move item id " << id << " name '" << name
10755 << "' to location " << loc << " in crush map";
10756 err = 0;
10757 goto reply;
10758 }
10759
10760 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10761 CrushWrapper newcrush = _get_pending_crush();
10762
10763 if (!newcrush.name_exists(name)) {
10764 err = -ENOENT;
10765 ss << "item " << name << " does not exist";
10766 goto reply;
10767 } else {
10768 int id = newcrush.get_item_id(name);
10769 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10770 err = newcrush.link_bucket(cct, id, loc);
10771 if (err >= 0) {
10772 ss << "linked item id " << id << " name '" << name
10773 << "' to location " << loc << " in crush map";
10774 pending_inc.crush.clear();
10775 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10776 } else {
10777 ss << "cannot link item id " << id << " name '" << name
10778 << "' to location " << loc;
10779 goto reply;
10780 }
10781 } else {
10782 ss << "no need to move item id " << id << " name '" << name
10783 << "' to location " << loc << " in crush map";
10784 err = 0;
10785 }
10786 }
10787 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10788 get_last_committed() + 1));
10789 return true;
10790 } else if (prefix == "osd crush rm" ||
10791 prefix == "osd crush remove" ||
10792 prefix == "osd crush unlink") {
10793 do {
10794 // osd crush rm <id> [ancestor]
10795 CrushWrapper newcrush = _get_pending_crush();
10796
10797 string name;
10798 cmd_getval(cmdmap, "name", name);
10799
10800 if (!osdmap.crush->name_exists(name)) {
10801 err = 0;
10802 ss << "device '" << name << "' does not appear in the crush map";
10803 break;
10804 }
10805 if (!newcrush.name_exists(name)) {
10806 err = 0;
10807 ss << "device '" << name << "' does not appear in the crush map";
10808 getline(ss, rs);
10809 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10810 get_last_committed() + 1));
10811 return true;
10812 }
10813 int id = newcrush.get_item_id(name);
10814 int ancestor = 0;
10815
10816 bool unlink_only = prefix == "osd crush unlink";
10817 string ancestor_str;
10818 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10819 if (!newcrush.name_exists(ancestor_str)) {
10820 err = -ENOENT;
10821 ss << "ancestor item '" << ancestor_str
10822 << "' does not appear in the crush map";
10823 break;
10824 }
10825 ancestor = newcrush.get_item_id(ancestor_str);
10826 }
10827
10828 err = prepare_command_osd_crush_remove(
10829 newcrush,
10830 id, ancestor,
10831 (ancestor < 0), unlink_only);
10832
10833 if (err == -ENOENT) {
10834 ss << "item " << id << " does not appear in that position";
10835 err = 0;
10836 break;
10837 }
10838 if (err == 0) {
10839 if (!unlink_only)
10840 pending_inc.new_crush_node_flags[id] = 0;
10841 ss << "removed item id " << id << " name '" << name << "' from crush map";
10842 getline(ss, rs);
10843 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10844 get_last_committed() + 1));
10845 return true;
10846 }
10847 } while (false);
10848
10849 } else if (prefix == "osd crush reweight-all") {
10850 CrushWrapper newcrush = _get_pending_crush();
10851
10852 newcrush.reweight(cct);
10853 pending_inc.crush.clear();
10854 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10855 ss << "reweighted crush hierarchy";
10856 getline(ss, rs);
10857 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10858 get_last_committed() + 1));
10859 return true;
10860 } else if (prefix == "osd crush reweight") {
10861 // osd crush reweight <name> <weight>
10862 CrushWrapper newcrush = _get_pending_crush();
10863
10864 string name;
10865 cmd_getval(cmdmap, "name", name);
10866 if (!newcrush.name_exists(name)) {
10867 err = -ENOENT;
10868 ss << "device '" << name << "' does not appear in the crush map";
10869 goto reply;
10870 }
10871
10872 int id = newcrush.get_item_id(name);
10873 if (id < 0) {
10874 ss << "device '" << name << "' is not a leaf in the crush map";
10875 err = -EINVAL;
10876 goto reply;
10877 }
10878 double w;
10879 if (!cmd_getval(cmdmap, "weight", w)) {
10880 ss << "unable to parse weight value '"
10881 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10882 err = -EINVAL;
10883 goto reply;
10884 }
10885
10886 err = newcrush.adjust_item_weightf(cct, id, w,
10887 g_conf()->osd_crush_update_weight_set);
10888 if (err < 0)
10889 goto reply;
10890 pending_inc.crush.clear();
10891 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10892 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10893 << " in crush map";
10894 getline(ss, rs);
10895 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10896 get_last_committed() + 1));
10897 return true;
10898 } else if (prefix == "osd crush reweight-subtree") {
10899 // osd crush reweight-subtree <name> <weight>
10900 CrushWrapper newcrush = _get_pending_crush();
10901
10902 string name;
10903 cmd_getval(cmdmap, "name", name);
10904 if (!newcrush.name_exists(name)) {
10905 err = -ENOENT;
10906 ss << "device '" << name << "' does not appear in the crush map";
10907 goto reply;
10908 }
10909
10910 int id = newcrush.get_item_id(name);
10911 if (id >= 0) {
10912 ss << "device '" << name << "' is not a subtree in the crush map";
10913 err = -EINVAL;
10914 goto reply;
10915 }
10916 double w;
10917 if (!cmd_getval(cmdmap, "weight", w)) {
10918 ss << "unable to parse weight value '"
10919 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10920 err = -EINVAL;
10921 goto reply;
10922 }
10923
10924 err = newcrush.adjust_subtree_weightf(cct, id, w,
10925 g_conf()->osd_crush_update_weight_set);
10926 if (err < 0)
10927 goto reply;
10928 pending_inc.crush.clear();
10929 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10930 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10931 << " in crush map";
10932 getline(ss, rs);
10933 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10934 get_last_committed() + 1));
10935 return true;
10936 } else if (prefix == "osd crush tunables") {
10937 CrushWrapper newcrush = _get_pending_crush();
10938
10939 err = 0;
10940 string profile;
10941 cmd_getval(cmdmap, "profile", profile);
10942 if (profile == "legacy" || profile == "argonaut") {
10943 newcrush.set_tunables_legacy();
10944 } else if (profile == "bobtail") {
10945 newcrush.set_tunables_bobtail();
10946 } else if (profile == "firefly") {
10947 newcrush.set_tunables_firefly();
10948 } else if (profile == "hammer") {
10949 newcrush.set_tunables_hammer();
10950 } else if (profile == "jewel") {
10951 newcrush.set_tunables_jewel();
10952 } else if (profile == "optimal") {
10953 newcrush.set_tunables_optimal();
10954 } else if (profile == "default") {
10955 newcrush.set_tunables_default();
10956 } else {
10957 ss << "unrecognized profile '" << profile << "'";
10958 err = -EINVAL;
10959 goto reply;
10960 }
10961
10962 if (!validate_crush_against_features(&newcrush, ss)) {
10963 err = -EINVAL;
10964 goto reply;
10965 }
10966
10967 pending_inc.crush.clear();
10968 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10969 ss << "adjusted tunables profile to " << profile;
10970 getline(ss, rs);
10971 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10972 get_last_committed() + 1));
10973 return true;
10974 } else if (prefix == "osd crush set-tunable") {
10975 CrushWrapper newcrush = _get_pending_crush();
10976
10977 err = 0;
10978 string tunable;
10979 cmd_getval(cmdmap, "tunable", tunable);
10980
10981 int64_t value = -1;
10982 if (!cmd_getval(cmdmap, "value", value)) {
10983 err = -EINVAL;
10984 ss << "failed to parse integer value "
10985 << cmd_vartype_stringify(cmdmap.at("value"));
10986 goto reply;
10987 }
10988
10989 if (tunable == "straw_calc_version") {
10990 if (value != 0 && value != 1) {
10991 ss << "value must be 0 or 1; got " << value;
10992 err = -EINVAL;
10993 goto reply;
10994 }
10995 newcrush.set_straw_calc_version(value);
10996 } else {
10997 ss << "unrecognized tunable '" << tunable << "'";
10998 err = -EINVAL;
10999 goto reply;
11000 }
11001
11002 if (!validate_crush_against_features(&newcrush, ss)) {
11003 err = -EINVAL;
11004 goto reply;
11005 }
11006
11007 pending_inc.crush.clear();
11008 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11009 ss << "adjusted tunable " << tunable << " to " << value;
11010 getline(ss, rs);
11011 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11012 get_last_committed() + 1));
11013 return true;
11014
11015 } else if (prefix == "osd crush rule create-simple") {
11016 string name, root, type, mode;
11017 cmd_getval(cmdmap, "name", name);
11018 cmd_getval(cmdmap, "root", root);
11019 cmd_getval(cmdmap, "type", type);
11020 cmd_getval(cmdmap, "mode", mode);
11021 if (mode == "")
11022 mode = "firstn";
11023
11024 if (osdmap.crush->rule_exists(name)) {
11025 // The name is uniquely associated to a ruleid and the rule it contains
11026 // From the user point of view, the rule is more meaningful.
11027 ss << "rule " << name << " already exists";
11028 err = 0;
11029 goto reply;
11030 }
11031
11032 CrushWrapper newcrush = _get_pending_crush();
11033
11034 if (newcrush.rule_exists(name)) {
11035 // The name is uniquely associated to a ruleid and the rule it contains
11036 // From the user point of view, the rule is more meaningful.
11037 ss << "rule " << name << " already exists";
11038 err = 0;
11039 } else {
11040 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11041 pg_pool_t::TYPE_REPLICATED, &ss);
11042 if (ruleno < 0) {
11043 err = ruleno;
11044 goto reply;
11045 }
11046
11047 pending_inc.crush.clear();
11048 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11049 }
11050 getline(ss, rs);
11051 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11052 get_last_committed() + 1));
11053 return true;
11054
11055 } else if (prefix == "osd crush rule create-replicated") {
11056 string name, root, type, device_class;
11057 cmd_getval(cmdmap, "name", name);
11058 cmd_getval(cmdmap, "root", root);
11059 cmd_getval(cmdmap, "type", type);
11060 cmd_getval(cmdmap, "class", device_class);
11061
11062 if (osdmap.crush->rule_exists(name)) {
11063 // The name is uniquely associated to a ruleid and the rule it contains
11064 // From the user point of view, the rule is more meaningful.
11065 ss << "rule " << name << " already exists";
11066 err = 0;
11067 goto reply;
11068 }
11069
11070 CrushWrapper newcrush = _get_pending_crush();
11071
11072 if (newcrush.rule_exists(name)) {
11073 // The name is uniquely associated to a ruleid and the rule it contains
11074 // From the user point of view, the rule is more meaningful.
11075 ss << "rule " << name << " already exists";
11076 err = 0;
11077 } else {
11078 int ruleno = newcrush.add_simple_rule(
11079 name, root, type, device_class,
11080 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11081 if (ruleno < 0) {
11082 err = ruleno;
11083 goto reply;
11084 }
11085
11086 pending_inc.crush.clear();
11087 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11088 }
11089 getline(ss, rs);
11090 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11091 get_last_committed() + 1));
11092 return true;
11093
11094 } else if (prefix == "osd erasure-code-profile rm") {
11095 string name;
11096 cmd_getval(cmdmap, "name", name);
11097
11098 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11099 goto wait;
11100
11101 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11102 err = -EBUSY;
11103 goto reply;
11104 }
11105
11106 if (osdmap.has_erasure_code_profile(name) ||
11107 pending_inc.new_erasure_code_profiles.count(name)) {
11108 if (osdmap.has_erasure_code_profile(name)) {
11109 pending_inc.old_erasure_code_profiles.push_back(name);
11110 } else {
11111 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11112 pending_inc.new_erasure_code_profiles.erase(name);
11113 }
11114
11115 getline(ss, rs);
11116 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11117 get_last_committed() + 1));
11118 return true;
11119 } else {
11120 ss << "erasure-code-profile " << name << " does not exist";
11121 err = 0;
11122 goto reply;
11123 }
11124
11125 } else if (prefix == "osd erasure-code-profile set") {
11126 string name;
11127 cmd_getval(cmdmap, "name", name);
11128 vector<string> profile;
11129 cmd_getval(cmdmap, "profile", profile);
11130
11131 bool force = false;
11132 cmd_getval(cmdmap, "force", force);
11133
11134 map<string,string> profile_map;
11135 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11136 if (err)
11137 goto reply;
11138 if (auto found = profile_map.find("crush-failure-domain");
11139 found != profile_map.end()) {
11140 const auto& failure_domain = found->second;
11141 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11142 if (failure_domain_type < 0) {
11143 ss << "erasure-code-profile " << profile_map
11144 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11145 err = -EINVAL;
11146 goto reply;
11147 }
11148 }
11149
11150 if (profile_map.find("plugin") == profile_map.end()) {
11151 ss << "erasure-code-profile " << profile_map
11152 << " must contain a plugin entry" << std::endl;
11153 err = -EINVAL;
11154 goto reply;
11155 }
11156 string plugin = profile_map["plugin"];
11157
11158 if (pending_inc.has_erasure_code_profile(name)) {
11159 dout(20) << "erasure code profile " << name << " try again" << dendl;
11160 goto wait;
11161 } else {
11162 err = normalize_profile(name, profile_map, force, &ss);
11163 if (err)
11164 goto reply;
11165
11166 if (osdmap.has_erasure_code_profile(name)) {
11167 ErasureCodeProfile existing_profile_map =
11168 osdmap.get_erasure_code_profile(name);
11169 err = normalize_profile(name, existing_profile_map, force, &ss);
11170 if (err)
11171 goto reply;
11172
11173 if (existing_profile_map == profile_map) {
11174 err = 0;
11175 goto reply;
11176 }
11177 if (!force) {
11178 err = -EPERM;
11179 ss << "will not override erasure code profile " << name
11180 << " because the existing profile "
11181 << existing_profile_map
11182 << " is different from the proposed profile "
11183 << profile_map;
11184 goto reply;
11185 }
11186 }
11187
11188 dout(20) << "erasure code profile set " << name << "="
11189 << profile_map << dendl;
11190 pending_inc.set_erasure_code_profile(name, profile_map);
11191 }
11192
11193 getline(ss, rs);
11194 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11195 get_last_committed() + 1));
11196 return true;
11197
11198 } else if (prefix == "osd crush rule create-erasure") {
11199 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11200 if (err == -EAGAIN)
11201 goto wait;
11202 if (err)
11203 goto reply;
11204 string name, poolstr;
11205 cmd_getval(cmdmap, "name", name);
11206 string profile;
11207 cmd_getval(cmdmap, "profile", profile);
11208 if (profile == "")
11209 profile = "default";
11210 if (profile == "default") {
11211 if (!osdmap.has_erasure_code_profile(profile)) {
11212 if (pending_inc.has_erasure_code_profile(profile)) {
11213 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11214 goto wait;
11215 }
11216
11217 map<string,string> profile_map;
11218 err = osdmap.get_erasure_code_profile_default(cct,
11219 profile_map,
11220 &ss);
11221 if (err)
11222 goto reply;
11223 err = normalize_profile(name, profile_map, true, &ss);
11224 if (err)
11225 goto reply;
11226 dout(20) << "erasure code profile set " << profile << "="
11227 << profile_map << dendl;
11228 pending_inc.set_erasure_code_profile(profile, profile_map);
11229 goto wait;
11230 }
11231 }
11232
11233 int rule;
11234 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11235 if (err < 0) {
11236 switch(err) {
11237 case -EEXIST: // return immediately
11238 ss << "rule " << name << " already exists";
11239 err = 0;
11240 goto reply;
11241 break;
11242 case -EALREADY: // wait for pending to be proposed
11243 ss << "rule " << name << " already exists";
11244 err = 0;
11245 break;
11246 default: // non recoverable error
11247 goto reply;
11248 break;
11249 }
11250 } else {
11251 ss << "created rule " << name << " at " << rule;
11252 }
11253
11254 getline(ss, rs);
11255 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11256 get_last_committed() + 1));
11257 return true;
11258
11259 } else if (prefix == "osd crush rule rm") {
11260 string name;
11261 cmd_getval(cmdmap, "name", name);
11262
11263 if (!osdmap.crush->rule_exists(name)) {
11264 ss << "rule " << name << " does not exist";
11265 err = 0;
11266 goto reply;
11267 }
11268
11269 CrushWrapper newcrush = _get_pending_crush();
11270
11271 if (!newcrush.rule_exists(name)) {
11272 ss << "rule " << name << " does not exist";
11273 err = 0;
11274 } else {
11275 int ruleno = newcrush.get_rule_id(name);
11276 ceph_assert(ruleno >= 0);
11277
11278 // make sure it is not in use.
11279 // FIXME: this is ok in some situations, but let's not bother with that
11280 // complexity now.
11281 if (osdmap.crush_rule_in_use(ruleno)) {
11282 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11283 err = -EBUSY;
11284 goto reply;
11285 }
11286
11287 err = newcrush.remove_rule(ruleno);
11288 if (err < 0) {
11289 goto reply;
11290 }
11291
11292 pending_inc.crush.clear();
11293 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11294 }
11295 getline(ss, rs);
11296 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11297 get_last_committed() + 1));
11298 return true;
11299
11300 } else if (prefix == "osd crush rule rename") {
11301 string srcname;
11302 string dstname;
11303 cmd_getval(cmdmap, "srcname", srcname);
11304 cmd_getval(cmdmap, "dstname", dstname);
11305 if (srcname.empty() || dstname.empty()) {
11306 ss << "must specify both source rule name and destination rule name";
11307 err = -EINVAL;
11308 goto reply;
11309 }
11310 if (srcname == dstname) {
11311 ss << "destination rule name is equal to source rule name";
11312 err = 0;
11313 goto reply;
11314 }
11315
11316 CrushWrapper newcrush = _get_pending_crush();
11317 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11318 // srcname does not exist and dstname already exists
11319 // suppose this is a replay and return success
11320 // (so this command is idempotent)
11321 ss << "already renamed to '" << dstname << "'";
11322 err = 0;
11323 goto reply;
11324 }
11325
11326 err = newcrush.rename_rule(srcname, dstname, &ss);
11327 if (err < 0) {
11328 // ss has reason for failure
11329 goto reply;
11330 }
11331 pending_inc.crush.clear();
11332 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11333 getline(ss, rs);
11334 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11335 get_last_committed() + 1));
11336 return true;
11337
11338 } else if (prefix == "osd setmaxosd") {
11339 int64_t newmax;
11340 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11341 ss << "unable to parse 'newmax' value '"
11342 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11343 err = -EINVAL;
11344 goto reply;
11345 }
11346
11347 if (newmax > g_conf()->mon_max_osd) {
11348 err = -ERANGE;
11349 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11350 << g_conf()->mon_max_osd << ")";
11351 goto reply;
11352 }
11353
11354 // Don't allow shrinking OSD number as this will cause data loss
11355 // and may cause kernel crashes.
11356 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11357 if (newmax < osdmap.get_max_osd()) {
11358 // Check if the OSDs exist between current max and new value.
11359 // If any OSDs exist in that range, don't allow shrinking the number
11360 // of OSDs.
11361 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11362 if (osdmap.exists(i)) {
11363 err = -EBUSY;
11364 ss << "cannot shrink max_osd to " << newmax
11365 << " because osd." << i << " (and possibly others) still in use";
11366 goto reply;
11367 }
11368 }
11369 }
11370
11371 pending_inc.new_max_osd = newmax;
11372 ss << "set new max_osd = " << pending_inc.new_max_osd;
11373 getline(ss, rs);
11374 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11375 get_last_committed() + 1));
11376 return true;
11377
11378 } else if (prefix == "osd set-full-ratio" ||
11379 prefix == "osd set-backfillfull-ratio" ||
11380 prefix == "osd set-nearfull-ratio") {
11381 double n;
11382 if (!cmd_getval(cmdmap, "ratio", n)) {
11383 ss << "unable to parse 'ratio' value '"
11384 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11385 err = -EINVAL;
11386 goto reply;
11387 }
11388 if (prefix == "osd set-full-ratio")
11389 pending_inc.new_full_ratio = n;
11390 else if (prefix == "osd set-backfillfull-ratio")
11391 pending_inc.new_backfillfull_ratio = n;
11392 else if (prefix == "osd set-nearfull-ratio")
11393 pending_inc.new_nearfull_ratio = n;
11394 ss << prefix << " " << n;
11395 getline(ss, rs);
11396 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11397 get_last_committed() + 1));
11398 return true;
11399 } else if (prefix == "osd set-require-min-compat-client") {
11400 string v;
11401 cmd_getval(cmdmap, "version", v);
11402 ceph_release_t vno = ceph_release_from_name(v);
11403 if (!vno) {
11404 ss << "version " << v << " is not recognized";
11405 err = -EINVAL;
11406 goto reply;
11407 }
11408 OSDMap newmap;
11409 newmap.deepish_copy_from(osdmap);
11410 newmap.apply_incremental(pending_inc);
11411 newmap.require_min_compat_client = vno;
11412 auto mvno = newmap.get_min_compat_client();
11413 if (vno < mvno) {
11414 ss << "osdmap current utilizes features that require " << mvno
11415 << "; cannot set require_min_compat_client below that to " << vno;
11416 err = -EPERM;
11417 goto reply;
11418 }
11419 bool sure = false;
11420 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11421 if (!sure) {
11422 FeatureMap m;
11423 mon.get_combined_feature_map(&m);
11424 uint64_t features = ceph_release_features(to_integer<int>(vno));
11425 bool first = true;
11426 bool ok = true;
11427 for (int type : {
11428 CEPH_ENTITY_TYPE_CLIENT,
11429 CEPH_ENTITY_TYPE_MDS,
11430 CEPH_ENTITY_TYPE_MGR }) {
11431 auto p = m.m.find(type);
11432 if (p == m.m.end()) {
11433 continue;
11434 }
11435 for (auto& q : p->second) {
11436 uint64_t missing = ~q.first & features;
11437 if (missing) {
11438 if (first) {
11439 ss << "cannot set require_min_compat_client to " << v << ": ";
11440 } else {
11441 ss << "; ";
11442 }
11443 first = false;
11444 ss << q.second << " connected " << ceph_entity_type_name(type)
11445 << "(s) look like " << ceph_release_name(
11446 ceph_release_from_features(q.first))
11447 << " (missing 0x" << std::hex << missing << std::dec << ")";
11448 ok = false;
11449 }
11450 }
11451 }
11452 if (!ok) {
11453 ss << "; add --yes-i-really-mean-it to do it anyway";
11454 err = -EPERM;
11455 goto reply;
11456 }
11457 }
11458 ss << "set require_min_compat_client to " << vno;
11459 pending_inc.new_require_min_compat_client = vno;
11460 getline(ss, rs);
11461 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11462 get_last_committed() + 1));
11463 return true;
11464 } else if (prefix == "osd pause") {
11465 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11466
11467 } else if (prefix == "osd unpause") {
11468 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11469
11470 } else if (prefix == "osd set") {
11471 bool sure = false;
11472 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11473
11474 string key;
11475 cmd_getval(cmdmap, "key", key);
11476 if (key == "pause")
11477 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11478 else if (key == "noup")
11479 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11480 else if (key == "nodown")
11481 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11482 else if (key == "noout")
11483 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11484 else if (key == "noin")
11485 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11486 else if (key == "nobackfill")
11487 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11488 else if (key == "norebalance")
11489 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11490 else if (key == "norecover")
11491 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11492 else if (key == "noscrub")
11493 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11494 else if (key == "nodeep-scrub")
11495 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11496 else if (key == "notieragent")
11497 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11498 else if (key == "nosnaptrim")
11499 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11500 else if (key == "pglog_hardlimit") {
11501 if (!osdmap.get_num_up_osds() && !sure) {
11502 ss << "Not advisable to continue since no OSDs are up. Pass "
11503 << "--yes-i-really-mean-it if you really wish to continue.";
11504 err = -EPERM;
11505 goto reply;
11506 }
11507 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11508 // we are reusing a jewel feature bit that was retired in luminous.
11509 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11510 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11511 || sure)) {
11512 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11513 } else {
11514 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11515 err = -EPERM;
11516 goto reply;
11517 }
11518 } else {
11519 ss << "unrecognized flag '" << key << "'";
11520 err = -EINVAL;
11521 }
11522
11523 } else if (prefix == "osd unset") {
11524 string key;
11525 cmd_getval(cmdmap, "key", key);
11526 if (key == "pause")
11527 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11528 else if (key == "noup")
11529 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11530 else if (key == "nodown")
11531 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11532 else if (key == "noout")
11533 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11534 else if (key == "noin")
11535 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11536 else if (key == "nobackfill")
11537 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11538 else if (key == "norebalance")
11539 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11540 else if (key == "norecover")
11541 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11542 else if (key == "noscrub")
11543 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11544 else if (key == "nodeep-scrub")
11545 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11546 else if (key == "notieragent")
11547 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11548 else if (key == "nosnaptrim")
11549 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11550 else {
11551 ss << "unrecognized flag '" << key << "'";
11552 err = -EINVAL;
11553 }
11554
11555 } else if (prefix == "osd require-osd-release") {
11556 string release;
11557 cmd_getval(cmdmap, "release", release);
11558 bool sure = false;
11559 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11560 ceph_release_t rel = ceph_release_from_name(release.c_str());
11561 if (!rel) {
11562 ss << "unrecognized release " << release;
11563 err = -EINVAL;
11564 goto reply;
11565 }
11566 if (rel == osdmap.require_osd_release) {
11567 // idempotent
11568 err = 0;
11569 goto reply;
11570 }
11571 ceph_assert(osdmap.require_osd_release >= ceph_release_t::octopus);
11572 if (!osdmap.get_num_up_osds() && !sure) {
11573 ss << "Not advisable to continue since no OSDs are up. Pass "
11574 << "--yes-i-really-mean-it if you really wish to continue.";
11575 err = -EPERM;
11576 goto reply;
11577 }
11578 if (rel == ceph_release_t::octopus) {
11579 if (!mon.monmap->get_required_features().contains_all(
11580 ceph::features::mon::FEATURE_OCTOPUS)) {
11581 ss << "not all mons are octopus";
11582 err = -EPERM;
11583 goto reply;
11584 }
11585 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11586 && !sure) {
11587 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11588 err = -EPERM;
11589 goto reply;
11590 }
11591 } else if (rel == ceph_release_t::pacific) {
11592 if (!mon.monmap->get_required_features().contains_all(
11593 ceph::features::mon::FEATURE_PACIFIC)) {
11594 ss << "not all mons are pacific";
11595 err = -EPERM;
11596 goto reply;
11597 }
11598 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11599 && !sure) {
11600 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11601 err = -EPERM;
11602 goto reply;
11603 }
11604 } else if (rel == ceph_release_t::quincy) {
11605 if (!mon.monmap->get_required_features().contains_all(
11606 ceph::features::mon::FEATURE_QUINCY)) {
11607 ss << "not all mons are quincy";
11608 err = -EPERM;
11609 goto reply;
11610 }
11611 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11612 && !sure) {
11613 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11614 err = -EPERM;
11615 goto reply;
11616 }
11617 } else {
11618 ss << "not supported for this release";
11619 err = -EPERM;
11620 goto reply;
11621 }
11622 if (rel < osdmap.require_osd_release) {
11623 ss << "require_osd_release cannot be lowered once it has been set";
11624 err = -EPERM;
11625 goto reply;
11626 }
11627 pending_inc.new_require_osd_release = rel;
11628 goto update;
11629 } else if (prefix == "osd down" ||
11630 prefix == "osd out" ||
11631 prefix == "osd in" ||
11632 prefix == "osd rm" ||
11633 prefix == "osd stop") {
11634
11635 bool any = false;
11636 bool stop = false;
11637 bool verbose = true;
11638 bool definitely_dead = false;
11639
11640 vector<string> idvec;
11641 cmd_getval(cmdmap, "ids", idvec);
11642 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11643 derr << "definitely_dead " << (int)definitely_dead << dendl;
11644 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11645 set<int> osds;
11646
11647 // wildcard?
11648 if (j == 0 &&
11649 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11650 if (prefix == "osd in") {
11651 // touch out osds only
11652 osdmap.get_out_existing_osds(osds);
11653 } else {
11654 osdmap.get_all_osds(osds);
11655 }
11656 stop = true;
11657 verbose = false; // so the output is less noisy.
11658 } else {
11659 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11660 if (osd < 0) {
11661 ss << "invalid osd id" << osd;
11662 err = -EINVAL;
11663 continue;
11664 } else if (!osdmap.exists(osd)) {
11665 ss << "osd." << osd << " does not exist. ";
11666 continue;
11667 }
11668
11669 osds.insert(osd);
11670 }
11671
11672 for (auto &osd : osds) {
11673 if (prefix == "osd down") {
11674 if (osdmap.is_down(osd)) {
11675 if (verbose)
11676 ss << "osd." << osd << " is already down. ";
11677 } else {
11678 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11679 ss << "marked down osd." << osd << ". ";
11680 any = true;
11681 }
11682 if (definitely_dead) {
11683 if (!pending_inc.new_xinfo.count(osd)) {
11684 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11685 }
11686 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11687 any = true;
11688 }
11689 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11690 }
11691 } else if (prefix == "osd out") {
11692 if (osdmap.is_out(osd)) {
11693 if (verbose)
11694 ss << "osd." << osd << " is already out. ";
11695 } else {
11696 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11697 if (osdmap.osd_weight[osd]) {
11698 if (pending_inc.new_xinfo.count(osd) == 0) {
11699 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11700 }
11701 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11702 }
11703 ss << "marked out osd." << osd << ". ";
11704 std::ostringstream msg;
11705 msg << "Client " << op->get_session()->entity_name
11706 << " marked osd." << osd << " out";
11707 if (osdmap.is_up(osd)) {
11708 msg << ", while it was still marked up";
11709 } else {
11710 auto period = ceph_clock_now() - down_pending_out[osd];
11711 msg << ", after it was down for " << int(period.sec())
11712 << " seconds";
11713 }
11714
11715 mon.clog->info() << msg.str();
11716 any = true;
11717 }
11718 } else if (prefix == "osd in") {
11719 if (osdmap.is_in(osd)) {
11720 if (verbose)
11721 ss << "osd." << osd << " is already in. ";
11722 } else {
11723 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11724 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11725 if (pending_inc.new_xinfo.count(osd) == 0) {
11726 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11727 }
11728 pending_inc.new_xinfo[osd].old_weight = 0;
11729 } else {
11730 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11731 }
11732 ss << "marked in osd." << osd << ". ";
11733 any = true;
11734 }
11735 } else if (prefix == "osd rm") {
11736 err = prepare_command_osd_remove(osd);
11737
11738 if (err == -EBUSY) {
11739 if (any)
11740 ss << ", ";
11741 ss << "osd." << osd << " is still up; must be down before removal. ";
11742 } else {
11743 ceph_assert(err == 0);
11744 if (any) {
11745 ss << ", osd." << osd;
11746 } else {
11747 ss << "removed osd." << osd;
11748 }
11749 any = true;
11750 }
11751 } else if (prefix == "osd stop") {
11752 if (osdmap.is_stop(osd)) {
11753 if (verbose)
11754 ss << "osd." << osd << " is already stopped. ";
11755 } else if (osdmap.is_down(osd)) {
11756 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11757 ss << "stop down osd." << osd << ". ";
11758 any = true;
11759 } else {
11760 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11761 ss << "stop osd." << osd << ". ";
11762 any = true;
11763 }
11764 }
11765 }
11766 }
11767 if (any) {
11768 getline(ss, rs);
11769 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11770 get_last_committed() + 1));
11771 return true;
11772 }
11773 } else if (prefix == "osd set-group" ||
11774 prefix == "osd unset-group" ||
11775 prefix == "osd add-noup" ||
11776 prefix == "osd add-nodown" ||
11777 prefix == "osd add-noin" ||
11778 prefix == "osd add-noout" ||
11779 prefix == "osd rm-noup" ||
11780 prefix == "osd rm-nodown" ||
11781 prefix == "osd rm-noin" ||
11782 prefix == "osd rm-noout") {
11783 bool do_set = prefix == "osd set-group" ||
11784 prefix.find("add") != string::npos;
11785 string flag_str;
11786 unsigned flags = 0;
11787 vector<string> who;
11788 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11789 cmd_getval(cmdmap, "flags", flag_str);
11790 cmd_getval(cmdmap, "who", who);
11791 vector<string> raw_flags;
11792 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11793 for (auto& f : raw_flags) {
11794 if (f == "noup")
11795 flags |= CEPH_OSD_NOUP;
11796 else if (f == "nodown")
11797 flags |= CEPH_OSD_NODOWN;
11798 else if (f == "noin")
11799 flags |= CEPH_OSD_NOIN;
11800 else if (f == "noout")
11801 flags |= CEPH_OSD_NOOUT;
11802 else {
11803 ss << "unrecognized flag '" << f << "', must be one of "
11804 << "{noup,nodown,noin,noout}";
11805 err = -EINVAL;
11806 goto reply;
11807 }
11808 }
11809 } else {
11810 cmd_getval(cmdmap, "ids", who);
11811 if (prefix.find("noup") != string::npos)
11812 flags = CEPH_OSD_NOUP;
11813 else if (prefix.find("nodown") != string::npos)
11814 flags = CEPH_OSD_NODOWN;
11815 else if (prefix.find("noin") != string::npos)
11816 flags = CEPH_OSD_NOIN;
11817 else if (prefix.find("noout") != string::npos)
11818 flags = CEPH_OSD_NOOUT;
11819 else
11820 ceph_assert(0 == "Unreachable!");
11821 }
11822 if (flags == 0) {
11823 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11824 err = -EINVAL;
11825 goto reply;
11826 }
11827 if (who.empty()) {
11828 ss << "must specify at least one or more targets to set/unset";
11829 err = -EINVAL;
11830 goto reply;
11831 }
11832 set<int> osds;
11833 set<int> crush_nodes;
11834 set<int> device_classes;
11835 for (auto& w : who) {
11836 if (w == "any" || w == "all" || w == "*") {
11837 osdmap.get_all_osds(osds);
11838 break;
11839 }
11840 std::stringstream ts;
11841 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11842 osds.insert(osd);
11843 } else if (osdmap.crush->name_exists(w)) {
11844 crush_nodes.insert(osdmap.crush->get_item_id(w));
11845 } else if (osdmap.crush->class_exists(w)) {
11846 device_classes.insert(osdmap.crush->get_class_id(w));
11847 } else {
11848 ss << "unable to parse osd id or crush node or device class: "
11849 << "\"" << w << "\". ";
11850 }
11851 }
11852 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11853 // ss has reason for failure
11854 err = -EINVAL;
11855 goto reply;
11856 }
11857 bool any = false;
11858 for (auto osd : osds) {
11859 if (!osdmap.exists(osd)) {
11860 ss << "osd." << osd << " does not exist. ";
11861 continue;
11862 }
11863 if (do_set) {
11864 if (flags & CEPH_OSD_NOUP) {
11865 any |= osdmap.is_noup_by_osd(osd) ?
11866 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11867 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11868 }
11869 if (flags & CEPH_OSD_NODOWN) {
11870 any |= osdmap.is_nodown_by_osd(osd) ?
11871 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11872 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11873 }
11874 if (flags & CEPH_OSD_NOIN) {
11875 any |= osdmap.is_noin_by_osd(osd) ?
11876 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11877 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11878 }
11879 if (flags & CEPH_OSD_NOOUT) {
11880 any |= osdmap.is_noout_by_osd(osd) ?
11881 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11882 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11883 }
11884 } else {
11885 if (flags & CEPH_OSD_NOUP) {
11886 any |= osdmap.is_noup_by_osd(osd) ?
11887 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11888 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11889 }
11890 if (flags & CEPH_OSD_NODOWN) {
11891 any |= osdmap.is_nodown_by_osd(osd) ?
11892 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11893 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11894 }
11895 if (flags & CEPH_OSD_NOIN) {
11896 any |= osdmap.is_noin_by_osd(osd) ?
11897 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11898 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11899 }
11900 if (flags & CEPH_OSD_NOOUT) {
11901 any |= osdmap.is_noout_by_osd(osd) ?
11902 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11903 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11904 }
11905 }
11906 }
11907 for (auto& id : crush_nodes) {
11908 auto old_flags = osdmap.get_crush_node_flags(id);
11909 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11910 pending_flags |= old_flags; // adopt existing flags first!
11911 if (do_set) {
11912 pending_flags |= flags;
11913 } else {
11914 pending_flags &= ~flags;
11915 }
11916 any = true;
11917 }
11918 for (auto& id : device_classes) {
11919 auto old_flags = osdmap.get_device_class_flags(id);
11920 auto& pending_flags = pending_inc.new_device_class_flags[id];
11921 pending_flags |= old_flags;
11922 if (do_set) {
11923 pending_flags |= flags;
11924 } else {
11925 pending_flags &= ~flags;
11926 }
11927 any = true;
11928 }
11929 if (any) {
11930 getline(ss, rs);
11931 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11932 get_last_committed() + 1));
11933 return true;
11934 }
11935 } else if (prefix == "osd pg-temp") {
11936 string pgidstr;
11937 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
11938 ss << "unable to parse 'pgid' value '"
11939 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
11940 err = -EINVAL;
11941 goto reply;
11942 }
11943 pg_t pgid;
11944 if (!pgid.parse(pgidstr.c_str())) {
11945 ss << "invalid pgid '" << pgidstr << "'";
11946 err = -EINVAL;
11947 goto reply;
11948 }
11949 if (!osdmap.pg_exists(pgid)) {
11950 ss << "pg " << pgid << " does not exist";
11951 err = -ENOENT;
11952 goto reply;
11953 }
11954 if (pending_inc.new_pg_temp.count(pgid)) {
11955 dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
11956 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
11957 return true;
11958 }
11959
11960 vector<int64_t> id_vec;
11961 vector<int32_t> new_pg_temp;
11962 cmd_getval(cmdmap, "id", id_vec);
11963 if (id_vec.empty()) {
11964 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
11965 ss << "done cleaning up pg_temp of " << pgid;
11966 goto update;
11967 }
11968 for (auto osd : id_vec) {
11969 if (!osdmap.exists(osd)) {
11970 ss << "osd." << osd << " does not exist";
11971 err = -ENOENT;
11972 goto reply;
11973 }
11974 new_pg_temp.push_back(osd);
11975 }
11976
11977 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
11978 if ((int)new_pg_temp.size() < pool_min_size) {
11979 ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
11980 << pool_min_size << ")";
11981 err = -EINVAL;
11982 goto reply;
11983 }
11984
11985 int pool_size = osdmap.get_pg_pool_size(pgid);
11986 if ((int)new_pg_temp.size() > pool_size) {
11987 ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
11988 << pool_size << ")";
11989 err = -EINVAL;
11990 goto reply;
11991 }
11992
11993 pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
11994 new_pg_temp.begin(), new_pg_temp.end());
11995 ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
11996 goto update;
11997 } else if (prefix == "osd primary-temp") {
11998 string pgidstr;
11999 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
12000 ss << "unable to parse 'pgid' value '"
12001 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
12002 err = -EINVAL;
12003 goto reply;
12004 }
12005 pg_t pgid;
12006 if (!pgid.parse(pgidstr.c_str())) {
12007 ss << "invalid pgid '" << pgidstr << "'";
12008 err = -EINVAL;
12009 goto reply;
12010 }
12011 if (!osdmap.pg_exists(pgid)) {
12012 ss << "pg " << pgid << " does not exist";
12013 err = -ENOENT;
12014 goto reply;
12015 }
12016
12017 int64_t osd;
12018 if (!cmd_getval(cmdmap, "id", osd)) {
12019 ss << "unable to parse 'id' value '"
12020 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12021 err = -EINVAL;
12022 goto reply;
12023 }
12024 if (osd != -1 && !osdmap.exists(osd)) {
12025 ss << "osd." << osd << " does not exist";
12026 err = -ENOENT;
12027 goto reply;
12028 }
12029
12030 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12031 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12032 ss << "require_min_compat_client "
12033 << osdmap.require_min_compat_client
12034 << " < firefly, which is required for primary-temp";
12035 err = -EPERM;
12036 goto reply;
12037 }
12038
12039 pending_inc.new_primary_temp[pgid] = osd;
12040 ss << "set " << pgid << " primary_temp mapping to " << osd;
12041 goto update;
12042 } else if (prefix == "pg repeer") {
12043 pg_t pgid;
12044 string pgidstr;
12045 cmd_getval(cmdmap, "pgid", pgidstr);
12046 if (!pgid.parse(pgidstr.c_str())) {
12047 ss << "invalid pgid '" << pgidstr << "'";
12048 err = -EINVAL;
12049 goto reply;
12050 }
12051 if (!osdmap.pg_exists(pgid)) {
12052 ss << "pg '" << pgidstr << "' does not exist";
12053 err = -ENOENT;
12054 goto reply;
12055 }
12056 vector<int> acting;
12057 int primary;
12058 osdmap.pg_to_acting_osds(pgid, &acting, &primary);
12059 if (primary < 0) {
12060 err = -EAGAIN;
12061 ss << "pg currently has no primary";
12062 goto reply;
12063 }
12064 if (acting.size() > 1) {
12065 // map to just primary; it will map back to what it wants
12066 pending_inc.new_pg_temp[pgid] = { primary };
12067 } else {
12068 // hmm, pick another arbitrary osd to induce a change. Note
12069 // that this won't work if there is only one suitable OSD in the cluster.
12070 int i;
12071 bool done = false;
12072 for (i = 0; i < osdmap.get_max_osd(); ++i) {
12073 if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
12074 continue;
12075 }
12076 pending_inc.new_pg_temp[pgid] = { primary, i };
12077 done = true;
12078 break;
12079 }
12080 if (!done) {
12081 err = -EAGAIN;
12082 ss << "not enough up OSDs in the cluster to force repeer";
12083 goto reply;
12084 }
12085 }
12086 goto update;
12087 } else if (prefix == "osd pg-upmap" ||
12088 prefix == "osd rm-pg-upmap" ||
12089 prefix == "osd pg-upmap-items" ||
12090 prefix == "osd rm-pg-upmap-items") {
12091 if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
12092 ss << "min_compat_client "
12093 << osdmap.require_min_compat_client
12094 << " < luminous, which is required for pg-upmap. "
12095 << "Try 'ceph osd set-require-min-compat-client luminous' "
12096 << "before using the new interface";
12097 err = -EPERM;
12098 goto reply;
12099 }
12100 err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
12101 if (err == -EAGAIN)
12102 goto wait;
12103 if (err < 0)
12104 goto reply;
12105 string pgidstr;
12106 if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
12107 ss << "unable to parse 'pgid' value '"
12108 << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
12109 err = -EINVAL;
12110 goto reply;
12111 }
12112 pg_t pgid;
12113 if (!pgid.parse(pgidstr.c_str())) {
12114 ss << "invalid pgid '" << pgidstr << "'";
12115 err = -EINVAL;
12116 goto reply;
12117 }
12118 if (!osdmap.pg_exists(pgid)) {
12119 ss << "pg " << pgid << " does not exist";
12120 err = -ENOENT;
12121 goto reply;
12122 }
12123 if (pending_inc.old_pools.count(pgid.pool())) {
12124 ss << "pool of " << pgid << " is pending removal";
12125 err = -ENOENT;
12126 getline(ss, rs);
12127 wait_for_finished_proposal(op,
12128 new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
12129 return true;
12130 }
12131
12132 enum {
12133 OP_PG_UPMAP,
12134 OP_RM_PG_UPMAP,
12135 OP_PG_UPMAP_ITEMS,
12136 OP_RM_PG_UPMAP_ITEMS,
12137 } option;
12138
12139 if (prefix == "osd pg-upmap") {
12140 option = OP_PG_UPMAP;
12141 } else if (prefix == "osd rm-pg-upmap") {
12142 option = OP_RM_PG_UPMAP;
12143 } else if (prefix == "osd pg-upmap-items") {
12144 option = OP_PG_UPMAP_ITEMS;
12145 } else {
12146 option = OP_RM_PG_UPMAP_ITEMS;
12147 }
12148
12149 // check pending upmap changes
12150 switch (option) {
12151 case OP_PG_UPMAP: // fall through
12152 case OP_RM_PG_UPMAP:
12153 if (pending_inc.new_pg_upmap.count(pgid) ||
12154 pending_inc.old_pg_upmap.count(pgid)) {
12155 dout(10) << __func__ << " waiting for pending update on "
12156 << pgid << dendl;
12157 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12158 return true;
12159 }
12160 break;
12161
12162 case OP_PG_UPMAP_ITEMS: // fall through
12163 case OP_RM_PG_UPMAP_ITEMS:
12164 if (pending_inc.new_pg_upmap_items.count(pgid) ||
12165 pending_inc.old_pg_upmap_items.count(pgid)) {
12166 dout(10) << __func__ << " waiting for pending update on "
12167 << pgid << dendl;
12168 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12169 return true;
12170 }
12171 break;
12172
12173 default:
12174 ceph_abort_msg("invalid option");
12175 }
12176
12177 switch (option) {
12178 case OP_PG_UPMAP:
12179 {
12180 vector<int64_t> id_vec;
12181 if (!cmd_getval(cmdmap, "id", id_vec)) {
12182 ss << "unable to parse 'id' value(s) '"
12183 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12184 err = -EINVAL;
12185 goto reply;
12186 }
12187
12188 int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
12189 if ((int)id_vec.size() < pool_min_size) {
12190 ss << "num of osds (" << id_vec.size() <<") < pool min size ("
12191 << pool_min_size << ")";
12192 err = -EINVAL;
12193 goto reply;
12194 }
12195
12196 int pool_size = osdmap.get_pg_pool_size(pgid);
12197 if ((int)id_vec.size() > pool_size) {
12198 ss << "num of osds (" << id_vec.size() <<") > pool size ("
12199 << pool_size << ")";
12200 err = -EINVAL;
12201 goto reply;
12202 }
12203
12204 vector<int32_t> new_pg_upmap;
12205 for (auto osd : id_vec) {
12206 if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
12207 ss << "osd." << osd << " does not exist";
12208 err = -ENOENT;
12209 goto reply;
12210 }
12211 auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
12212 if (it != new_pg_upmap.end()) {
12213 ss << "osd." << osd << " already exists, ";
12214 continue;
12215 }
12216 new_pg_upmap.push_back(osd);
12217 }
12218
12219 if (new_pg_upmap.empty()) {
12220 ss << "no valid upmap items(pairs) is specified";
12221 err = -EINVAL;
12222 goto reply;
12223 }
12224
12225 pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
12226 new_pg_upmap.begin(), new_pg_upmap.end());
12227 ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
12228 }
12229 break;
12230
12231 case OP_RM_PG_UPMAP:
12232 {
12233 pending_inc.old_pg_upmap.insert(pgid);
12234 ss << "clear " << pgid << " pg_upmap mapping";
12235 }
12236 break;
12237
12238 case OP_PG_UPMAP_ITEMS:
12239 {
12240 vector<int64_t> id_vec;
12241 if (!cmd_getval(cmdmap, "id", id_vec)) {
12242 ss << "unable to parse 'id' value(s) '"
12243 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12244 err = -EINVAL;
12245 goto reply;
12246 }
12247
12248 if (id_vec.size() % 2) {
12249 ss << "you must specify pairs of osd ids to be remapped";
12250 err = -EINVAL;
12251 goto reply;
12252 }
12253
12254 int pool_size = osdmap.get_pg_pool_size(pgid);
12255 if ((int)(id_vec.size() / 2) > pool_size) {
12256 ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
12257 << pool_size << ")";
12258 err = -EINVAL;
12259 goto reply;
12260 }
12261
12262 vector<pair<int32_t,int32_t>> new_pg_upmap_items;
12263 ostringstream items;
12264 items << "[";
12265 for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
12266 int from = *p++;
12267 int to = *p;
12268 if (from == to) {
12269 ss << "from osd." << from << " == to osd." << to << ", ";
12270 continue;
12271 }
12272 if (!osdmap.exists(from)) {
12273 ss << "osd." << from << " does not exist";
12274 err = -ENOENT;
12275 goto reply;
12276 }
12277 if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
12278 ss << "osd." << to << " does not exist";
12279 err = -ENOENT;
12280 goto reply;
12281 }
12282 pair<int32_t,int32_t> entry = make_pair(from, to);
12283 auto it = std::find(new_pg_upmap_items.begin(),
12284 new_pg_upmap_items.end(), entry);
12285 if (it != new_pg_upmap_items.end()) {
12286 ss << "osd." << from << " -> osd." << to << " already exists, ";
12287 continue;
12288 }
12289 new_pg_upmap_items.push_back(entry);
12290 items << from << "->" << to << ",";
12291 }
12292 string out(items.str());
12293 out.resize(out.size() - 1); // drop last ','
12294 out += "]";
12295
12296 if (new_pg_upmap_items.empty()) {
12297 ss << "no valid upmap items(pairs) is specified";
12298 err = -EINVAL;
12299 goto reply;
12300 }
12301
12302 pending_inc.new_pg_upmap_items[pgid] =
12303 mempool::osdmap::vector<pair<int32_t,int32_t>>(
12304 new_pg_upmap_items.begin(), new_pg_upmap_items.end());
12305 ss << "set " << pgid << " pg_upmap_items mapping to " << out;
12306 }
12307 break;
12308
12309 case OP_RM_PG_UPMAP_ITEMS:
12310 {
12311 pending_inc.old_pg_upmap_items.insert(pgid);
12312 ss << "clear " << pgid << " pg_upmap_items mapping";
12313 }
12314 break;
12315
12316 default:
12317 ceph_abort_msg("invalid option");
12318 }
12319
12320 goto update;
12321 } else if (prefix == "osd primary-affinity") {
12322 int64_t id;
12323 if (!cmd_getval(cmdmap, "id", id)) {
12324 ss << "invalid osd id value '"
12325 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12326 err = -EINVAL;
12327 goto reply;
12328 }
12329 double w;
12330 if (!cmd_getval(cmdmap, "weight", w)) {
12331 ss << "unable to parse 'weight' value '"
12332 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12333 err = -EINVAL;
12334 goto reply;
12335 }
12336 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12337 if (ww < 0L) {
12338 ss << "weight must be >= 0";
12339 err = -EINVAL;
12340 goto reply;
12341 }
12342 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12343 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12344 ss << "require_min_compat_client "
12345 << osdmap.require_min_compat_client
12346 << " < firefly, which is required for primary-affinity";
12347 err = -EPERM;
12348 goto reply;
12349 }
12350 if (osdmap.exists(id)) {
12351 pending_inc.new_primary_affinity[id] = ww;
12352 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12353 getline(ss, rs);
12354 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12355 get_last_committed() + 1));
12356 return true;
12357 } else {
12358 ss << "osd." << id << " does not exist";
12359 err = -ENOENT;
12360 goto reply;
12361 }
12362 } else if (prefix == "osd reweight") {
12363 int64_t id;
12364 if (!cmd_getval(cmdmap, "id", id)) {
12365 ss << "unable to parse osd id value '"
12366 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12367 err = -EINVAL;
12368 goto reply;
12369 }
12370 double w;
12371 if (!cmd_getval(cmdmap, "weight", w)) {
12372 ss << "unable to parse weight value '"
12373 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12374 err = -EINVAL;
12375 goto reply;
12376 }
12377 long ww = (int)((double)CEPH_OSD_IN*w);
12378 if (ww < 0L) {
12379 ss << "weight must be >= 0";
12380 err = -EINVAL;
12381 goto reply;
12382 }
12383 if (osdmap.exists(id)) {
12384 pending_inc.new_weight[id] = ww;
12385 ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
12386 getline(ss, rs);
12387 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12388 get_last_committed() + 1));
12389 return true;
12390 } else {
12391 ss << "osd." << id << " does not exist";
12392 err = -ENOENT;
12393 goto reply;
12394 }
12395 } else if (prefix == "osd reweightn") {
12396 map<int32_t, uint32_t> weights;
12397 err = parse_reweights(cct, cmdmap, osdmap, &weights);
12398 if (err) {
12399 ss << "unable to parse 'weights' value '"
12400 << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
12401 goto reply;
12402 }
12403 pending_inc.new_weight.insert(weights.begin(), weights.end());
12404 wait_for_finished_proposal(
12405 op,
12406 new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
12407 return true;
12408 } else if (prefix == "osd lost") {
12409 int64_t id;
12410 if (!cmd_getval(cmdmap, "id", id)) {
12411 ss << "unable to parse osd id value '"
12412 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12413 err = -EINVAL;
12414 goto reply;
12415 }
12416 bool sure = false;
12417 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12418 if (!sure) {
12419 ss << "are you SURE? this might mean real, permanent data loss. pass "
12420 "--yes-i-really-mean-it if you really do.";
12421 err = -EPERM;
12422 goto reply;
12423 } else if (!osdmap.exists(id)) {
12424 ss << "osd." << id << " does not exist";
12425 err = -ENOENT;
12426 goto reply;
12427 } else if (!osdmap.is_down(id)) {
12428 ss << "osd." << id << " is not down";
12429 err = -EBUSY;
12430 goto reply;
12431 } else {
12432 epoch_t e = osdmap.get_info(id).down_at;
12433 pending_inc.new_lost[id] = e;
12434 ss << "marked osd lost in epoch " << e;
12435 getline(ss, rs);
12436 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12437 get_last_committed() + 1));
12438 return true;
12439 }
12440
12441 } else if (prefix == "osd destroy-actual" ||
12442 prefix == "osd purge-actual" ||
12443 prefix == "osd purge-new") {
12444 /* Destroying an OSD means that we don't expect to further make use of
12445 * the OSDs data (which may even become unreadable after this operation),
12446 * and that we are okay with scrubbing all its cephx keys and config-key
12447 * data (which may include lockbox keys, thus rendering the osd's data
12448 * unreadable).
12449 *
12450 * The OSD will not be removed. Instead, we will mark it as destroyed,
12451 * such that a subsequent call to `create` will not reuse the osd id.
12452 * This will play into being able to recreate the OSD, at the same
12453 * crush location, with minimal data movement.
12454 */
12455
12456 // make sure authmon is writeable.
12457 if (!mon.authmon()->is_writeable()) {
12458 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12459 << "osd destroy" << dendl;
12460 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12461 return false;
12462 }
12463
12464 int64_t id;
12465 if (!cmd_getval(cmdmap, "id", id)) {
12466 auto p = cmdmap.find("id");
12467 if (p == cmdmap.end()) {
12468 ss << "no osd id specified";
12469 } else {
12470 ss << "unable to parse osd id value '"
12471 << cmd_vartype_stringify(cmdmap.at("id")) << "";
12472 }
12473 err = -EINVAL;
12474 goto reply;
12475 }
12476
12477 bool is_destroy = (prefix == "osd destroy-actual");
12478 if (!is_destroy) {
12479 ceph_assert("osd purge-actual" == prefix ||
12480 "osd purge-new" == prefix);
12481 }
12482
12483 bool sure = false;
12484 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12485 if (!sure) {
12486 ss << "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12487 << "This will mean real, permanent data loss, as well "
12488 << "as deletion of cephx and lockbox keys. "
12489 << "Pass --yes-i-really-mean-it if you really do.";
12490 err = -EPERM;
12491 goto reply;
12492 } else if (!osdmap.exists(id)) {
12493 ss << "osd." << id << " does not exist";
12494 err = 0; // idempotent
12495 goto reply;
12496 } else if (osdmap.is_up(id)) {
12497 ss << "osd." << id << " is not `down`.";
12498 err = -EBUSY;
12499 goto reply;
12500 } else if (is_destroy && osdmap.is_destroyed(id)) {
12501 ss << "destroyed osd." << id;
12502 err = 0;
12503 goto reply;
12504 }
12505
12506 if (prefix == "osd purge-new" &&
12507 (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
12508 ss << "osd." << id << " is not new";
12509 err = -EPERM;
12510 goto reply;
12511 }
12512
12513 bool goto_reply = false;
12514
12515 paxos.plug();
12516 if (is_destroy) {
12517 err = prepare_command_osd_destroy(id, ss);
12518 // we checked above that it should exist.
12519 ceph_assert(err != -ENOENT);
12520 } else {
12521 err = prepare_command_osd_purge(id, ss);
12522 if (err == -ENOENT) {
12523 err = 0;
12524 ss << "osd." << id << " does not exist.";
12525 goto_reply = true;
12526 }
12527 }
12528 paxos.unplug();
12529
12530 if (err < 0 || goto_reply) {
12531 goto reply;
12532 }
12533
12534 if (is_destroy) {
12535 ss << "destroyed osd." << id;
12536 } else {
12537 ss << "purged osd." << id;
12538 }
12539
12540 getline(ss, rs);
12541 wait_for_finished_proposal(op,
12542 new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
12543 force_immediate_propose();
12544 return true;
12545
12546 } else if (prefix == "osd new") {
12547
12548 // make sure authmon is writeable.
12549 if (!mon.authmon()->is_writeable()) {
12550 dout(10) << __func__ << " waiting for auth mon to be writeable for "
12551 << "osd new" << dendl;
12552 mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
12553 return false;
12554 }
12555
12556 map<string,string> param_map;
12557
12558 bufferlist bl = m->get_data();
12559 string param_json = bl.to_str();
12560 dout(20) << __func__ << " osd new json = " << param_json << dendl;
12561
12562 err = get_json_str_map(param_json, ss, &param_map);
12563 if (err < 0)
12564 goto reply;
12565
12566 dout(20) << __func__ << " osd new params " << param_map << dendl;
12567
12568 paxos.plug();
12569 err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
12570 paxos.unplug();
12571
12572 if (err < 0) {
12573 goto reply;
12574 }
12575
12576 if (f) {
12577 f->flush(rdata);
12578 } else {
12579 rdata.append(ss);
12580 }
12581
12582 if (err == EEXIST) {
12583 // idempotent operation
12584 err = 0;
12585 goto reply;
12586 }
12587
12588 wait_for_finished_proposal(op,
12589 new Monitor::C_Command(mon, op, 0, rs, rdata,
12590 get_last_committed() + 1));
12591 force_immediate_propose();
12592 return true;
12593
12594 } else if (prefix == "osd create") {
12595
12596 // optional id provided?
12597 int64_t id = -1, cmd_id = -1;
12598 if (cmd_getval(cmdmap, "id", cmd_id)) {
12599 if (cmd_id < 0) {
12600 ss << "invalid osd id value '" << cmd_id << "'";
12601 err = -EINVAL;
12602 goto reply;
12603 }
12604 dout(10) << " osd create got id " << cmd_id << dendl;
12605 }
12606
12607 uuid_d uuid;
12608 string uuidstr;
12609 if (cmd_getval(cmdmap, "uuid", uuidstr)) {
12610 if (!uuid.parse(uuidstr.c_str())) {
12611 ss << "invalid uuid value '" << uuidstr << "'";
12612 err = -EINVAL;
12613 goto reply;
12614 }
12615 // we only care about the id if we also have the uuid, to
12616 // ensure the operation's idempotency.
12617 id = cmd_id;
12618 }
12619
12620 int32_t new_id = -1;
12621 err = prepare_command_osd_create(id, uuid, &new_id, ss);
12622 if (err < 0) {
12623 if (err == -EAGAIN) {
12624 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12625 return true;
12626 }
12627 // a check has failed; reply to the user.
12628 goto reply;
12629
12630 } else if (err == EEXIST) {
12631 // this is an idempotent operation; we can go ahead and reply.
12632 if (f) {
12633 f->open_object_section("created_osd");
12634 f->dump_int("osdid", new_id);
12635 f->close_section();
12636 f->flush(rdata);
12637 } else {
12638 ss << new_id;
12639 rdata.append(ss);
12640 }
12641 err = 0;
12642 goto reply;
12643 }
12644
12645 string empty_device_class;
12646 do_osd_create(id, uuid, empty_device_class, &new_id);
12647
12648 if (f) {
12649 f->open_object_section("created_osd");
12650 f->dump_int("osdid", new_id);
12651 f->close_section();
12652 f->flush(rdata);
12653 } else {
12654 ss << new_id;
12655 rdata.append(ss);
12656 }
12657 wait_for_finished_proposal(op,
12658 new Monitor::C_Command(mon, op, 0, rs, rdata,
12659 get_last_committed() + 1));
12660 return true;
12661
12662 } else if (prefix == "osd blocklist clear" ||
12663 prefix == "osd blacklist clear") {
12664 pending_inc.new_blocklist.clear();
12665 std::list<std::pair<entity_addr_t,utime_t > > blocklist;
12666 osdmap.get_blocklist(&blocklist);
12667 for (const auto &entry : blocklist) {
12668 pending_inc.old_blocklist.push_back(entry.first);
12669 }
12670 ss << " removed all blocklist entries";
12671 getline(ss, rs);
12672 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12673 get_last_committed() + 1));
12674 return true;
12675 } else if (prefix == "osd blocklist" ||
12676 prefix == "osd blacklist") {
12677 string addrstr;
12678 cmd_getval(cmdmap, "addr", addrstr);
12679 entity_addr_t addr;
12680 if (!addr.parse(addrstr)) {
12681 ss << "unable to parse address " << addrstr;
12682 err = -EINVAL;
12683 goto reply;
12684 }
12685 else {
12686 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12687 // always blocklist type ANY
12688 addr.set_type(entity_addr_t::TYPE_ANY);
12689 } else {
12690 addr.set_type(entity_addr_t::TYPE_LEGACY);
12691 }
12692
12693 string blocklistop;
12694 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12695 cmd_getval(cmdmap, "blacklistop", blocklistop);
12696 }
12697 if (blocklistop == "add") {
12698 utime_t expires = ceph_clock_now();
12699 // default one hour
12700 double d = cmd_getval_or<double>(cmdmap, "expire",
12701 g_conf()->mon_osd_blocklist_default_expire);
12702 expires += d;
12703
12704 pending_inc.new_blocklist[addr] = expires;
12705
12706 {
12707 // cancel any pending un-blocklisting request too
12708 auto it = std::find(pending_inc.old_blocklist.begin(),
12709 pending_inc.old_blocklist.end(), addr);
12710 if (it != pending_inc.old_blocklist.end()) {
12711 pending_inc.old_blocklist.erase(it);
12712 }
12713 }
12714
12715 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12716 getline(ss, rs);
12717 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12718 get_last_committed() + 1));
12719 return true;
12720 } else if (blocklistop == "rm") {
12721 if (osdmap.is_blocklisted(addr) ||
12722 pending_inc.new_blocklist.count(addr)) {
12723 if (osdmap.is_blocklisted(addr))
12724 pending_inc.old_blocklist.push_back(addr);
12725 else
12726 pending_inc.new_blocklist.erase(addr);
12727 ss << "un-blocklisting " << addr;
12728 getline(ss, rs);
12729 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12730 get_last_committed() + 1));
12731 return true;
12732 }
12733 ss << addr << " isn't blocklisted";
12734 err = 0;
12735 goto reply;
12736 }
12737 }
12738 } else if (prefix == "osd pool mksnap") {
12739 string poolstr;
12740 cmd_getval(cmdmap, "pool", poolstr);
12741 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12742 if (pool < 0) {
12743 ss << "unrecognized pool '" << poolstr << "'";
12744 err = -ENOENT;
12745 goto reply;
12746 }
12747 string snapname;
12748 cmd_getval(cmdmap, "snap", snapname);
12749 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12750 if (p->is_unmanaged_snaps_mode()) {
12751 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12752 err = -EINVAL;
12753 goto reply;
12754 } else if (p->snap_exists(snapname.c_str())) {
12755 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12756 err = 0;
12757 goto reply;
12758 } else if (p->is_tier()) {
12759 ss << "pool " << poolstr << " is a cache tier";
12760 err = -EINVAL;
12761 goto reply;
12762 }
12763 pg_pool_t *pp = 0;
12764 if (pending_inc.new_pools.count(pool))
12765 pp = &pending_inc.new_pools[pool];
12766 if (!pp) {
12767 pp = &pending_inc.new_pools[pool];
12768 *pp = *p;
12769 }
12770 if (pp->snap_exists(snapname.c_str())) {
12771 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12772 } else {
12773 pp->add_snap(snapname.c_str(), ceph_clock_now());
12774 pp->set_snap_epoch(pending_inc.epoch);
12775 ss << "created pool " << poolstr << " snap " << snapname;
12776 }
12777 getline(ss, rs);
12778 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12779 get_last_committed() + 1));
12780 return true;
12781 } else if (prefix == "osd pool rmsnap") {
12782 string poolstr;
12783 cmd_getval(cmdmap, "pool", poolstr);
12784 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12785 if (pool < 0) {
12786 ss << "unrecognized pool '" << poolstr << "'";
12787 err = -ENOENT;
12788 goto reply;
12789 }
12790 string snapname;
12791 cmd_getval(cmdmap, "snap", snapname);
12792 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12793 if (p->is_unmanaged_snaps_mode()) {
12794 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12795 err = -EINVAL;
12796 goto reply;
12797 } else if (!p->snap_exists(snapname.c_str())) {
12798 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12799 err = 0;
12800 goto reply;
12801 }
12802 pg_pool_t *pp = 0;
12803 if (pending_inc.new_pools.count(pool))
12804 pp = &pending_inc.new_pools[pool];
12805 if (!pp) {
12806 pp = &pending_inc.new_pools[pool];
12807 *pp = *p;
12808 }
12809 snapid_t sn = pp->snap_exists(snapname.c_str());
12810 if (sn) {
12811 pp->remove_snap(sn);
12812 pp->set_snap_epoch(pending_inc.epoch);
12813 ss << "removed pool " << poolstr << " snap " << snapname;
12814 } else {
12815 ss << "already removed pool " << poolstr << " snap " << snapname;
12816 }
12817 getline(ss, rs);
12818 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12819 get_last_committed() + 1));
12820 return true;
12821 } else if (prefix == "osd pool create") {
12822 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
12823 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
12824 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
12825 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
12826 string pool_type_str;
12827 cmd_getval(cmdmap, "pool_type", pool_type_str);
12828 if (pool_type_str.empty())
12829 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12830
12831 string poolstr;
12832 cmd_getval(cmdmap, "pool", poolstr);
12833 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12834 if (pool_id >= 0) {
12835 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12836 if (pool_type_str != p->get_type_name()) {
12837 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12838 err = -EINVAL;
12839 } else {
12840 ss << "pool '" << poolstr << "' already exists";
12841 err = 0;
12842 }
12843 goto reply;
12844 }
12845
12846 int pool_type;
12847 if (pool_type_str == "replicated") {
12848 pool_type = pg_pool_t::TYPE_REPLICATED;
12849 } else if (pool_type_str == "erasure") {
12850 pool_type = pg_pool_t::TYPE_ERASURE;
12851 } else {
12852 ss << "unknown pool type '" << pool_type_str << "'";
12853 err = -EINVAL;
12854 goto reply;
12855 }
12856
12857 bool implicit_rule_creation = false;
12858 int64_t expected_num_objects = 0;
12859 string rule_name;
12860 cmd_getval(cmdmap, "rule", rule_name);
12861 string erasure_code_profile;
12862 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12863
12864 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12865 if (erasure_code_profile == "")
12866 erasure_code_profile = "default";
12867 //handle the erasure code profile
12868 if (erasure_code_profile == "default") {
12869 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12870 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12871 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12872 goto wait;
12873 }
12874
12875 map<string,string> profile_map;
12876 err = osdmap.get_erasure_code_profile_default(cct,
12877 profile_map,
12878 &ss);
12879 if (err)
12880 goto reply;
12881 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12882 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12883 goto wait;
12884 }
12885 }
12886 if (rule_name == "") {
12887 implicit_rule_creation = true;
12888 if (erasure_code_profile == "default") {
12889 rule_name = "erasure-code";
12890 } else {
12891 dout(1) << "implicitly use rule named after the pool: "
12892 << poolstr << dendl;
12893 rule_name = poolstr;
12894 }
12895 }
12896 expected_num_objects =
12897 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
12898 } else {
12899 // NOTE: for a replicated pool, cmd_map puts rule_name into the erasure_code_profile field
12900 // and expected_num_objects into the rule field
12901 if (erasure_code_profile != "") { // cmd is from CLI
12902 if (rule_name != "") {
12903 string interr;
12904 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
12905 if (interr.length()) {
12906 ss << "error parsing integer value '" << rule_name << "': " << interr;
12907 err = -EINVAL;
12908 goto reply;
12909 }
12910 }
12911 rule_name = erasure_code_profile;
12912 } else { // cmd is well-formed
12913 expected_num_objects =
12914 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
12915 }
12916 }
12917
12918 if (!implicit_rule_creation && rule_name != "") {
12919 int rule;
12920 err = get_crush_rule(rule_name, &rule, &ss);
12921 if (err == -EAGAIN) {
12922 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
12923 return true;
12924 }
12925 if (err)
12926 goto reply;
12927 }
12928
12929 if (expected_num_objects < 0) {
12930 ss << "'expected_num_objects' must be non-negative";
12931 err = -EINVAL;
12932 goto reply;
12933 }
12934
12935 set<int32_t> osds;
12936 osdmap.get_all_osds(osds);
12937 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
12938 string type;
12939 if (!get_osd_objectstore_type(osd, &type)) {
12940 return type == "filestore";
12941 } else {
12942 return false;
12943 }
12944 });
12945
12946 if (has_filestore_osd &&
12947 expected_num_objects > 0 &&
12948 cct->_conf->filestore_merge_threshold > 0) {
12949 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12950 err = -EINVAL;
12951 goto reply;
12952 }
12953
12954 if (has_filestore_osd &&
12955 expected_num_objects == 0 &&
12956 cct->_conf->filestore_merge_threshold < 0) {
12957 int osds = osdmap.get_num_osds();
12958 bool sure = false;
12959 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
12960 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
12961 ss << "For better initial performance on pools expected to store a "
12962 << "large number of objects, consider supplying the "
12963 << "expected_num_objects parameter when creating the pool."
12964 << " Pass --yes-i-really-mean-it to ignore it";
12965 err = -EPERM;
12966 goto reply;
12967 }
12968 }
12969
12970 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
12971 FastReadType fast_read = FAST_READ_DEFAULT;
12972 if (fast_read_param == 0)
12973 fast_read = FAST_READ_OFF;
12974 else if (fast_read_param > 0)
12975 fast_read = FAST_READ_ON;
12976
12977 int64_t repl_size = 0;
12978 cmd_getval(cmdmap, "size", repl_size);
12979 int64_t target_size_bytes = 0;
12980 double target_size_ratio = 0.0;
12981 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
12982 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
12983
12984 string pg_autoscale_mode;
12985 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
12986
12987 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
12988 err = prepare_new_pool(poolstr,
12989 -1, // default crush rule
12990 rule_name,
12991 pg_num, pgp_num, pg_num_min, pg_num_max,
12992 repl_size, target_size_bytes, target_size_ratio,
12993 erasure_code_profile, pool_type,
12994 (uint64_t)expected_num_objects,
12995 fast_read,
12996 pg_autoscale_mode,
12997 bulk,
12998 &ss);
12999 if (err < 0) {
13000 switch(err) {
13001 case -EEXIST:
13002 ss << "pool '" << poolstr << "' already exists";
13003 break;
13004 case -EAGAIN:
13005 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13006 return true;
13007 case -ERANGE:
13008 goto reply;
13009 default:
13010 goto reply;
13011 break;
13012 }
13013 } else {
13014 ss << "pool '" << poolstr << "' created";
13015 }
13016 getline(ss, rs);
13017 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13018 get_last_committed() + 1));
13019 return true;
13020
13021 } else if (prefix == "osd pool delete" ||
13022 prefix == "osd pool rm") {
13023 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13024 string poolstr, poolstr2, sure;
13025 cmd_getval(cmdmap, "pool", poolstr);
13026 cmd_getval(cmdmap, "pool2", poolstr2);
13027 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13028 if (pool < 0) {
13029 ss << "pool '" << poolstr << "' does not exist";
13030 err = 0;
13031 goto reply;
13032 }
13033
13034 bool force_no_fake = false;
13035 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13036 bool force = false;
13037 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13038 if (poolstr2 != poolstr ||
13039 (!force && !force_no_fake)) {
13040 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13041 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13042 << "followed by --yes-i-really-really-mean-it.";
13043 err = -EPERM;
13044 goto reply;
13045 }
13046 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13047 if (err == -EAGAIN) {
13048 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13049 return true;
13050 }
13051 if (err < 0)
13052 goto reply;
13053 goto update;
13054 } else if (prefix == "osd pool rename") {
13055 string srcpoolstr, destpoolstr;
13056 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13057 cmd_getval(cmdmap, "destpool", destpoolstr);
13058 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13059 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13060
13061 if (pool_src < 0) {
13062 if (pool_dst >= 0) {
13063 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13064 // of operations, assume this rename succeeded, as it is not changing
13065 // the current state. Make sure we output something understandable
13066 // for whoever is issuing the command, if they are paying attention,
13067 // in case it was not intentional; or to avoid a "wtf?" and a bug
13068 // report in case it was intentional, while expecting a failure.
13069 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13070 << destpoolstr << "' does -- assuming successful rename";
13071 err = 0;
13072 } else {
13073 ss << "unrecognized pool '" << srcpoolstr << "'";
13074 err = -ENOENT;
13075 }
13076 goto reply;
13077 } else if (pool_dst >= 0) {
13078 // source pool exists and so does the destination pool
13079 ss << "pool '" << destpoolstr << "' already exists";
13080 err = -EEXIST;
13081 goto reply;
13082 }
13083
13084 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13085 if (ret == 0) {
13086 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13087 } else {
13088 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13089 << cpp_strerror(ret);
13090 }
13091 getline(ss, rs);
13092 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13093 get_last_committed() + 1));
13094 return true;
13095
13096 } else if (prefix == "osd pool set") {
13097 err = prepare_command_pool_set(cmdmap, ss);
13098 if (err == -EAGAIN)
13099 goto wait;
13100 if (err < 0)
13101 goto reply;
13102
13103 getline(ss, rs);
13104 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13105 get_last_committed() + 1));
13106 return true;
13107 } else if (prefix == "osd tier add") {
13108 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13109 if (err == -EAGAIN)
13110 goto wait;
13111 if (err)
13112 goto reply;
13113 string poolstr;
13114 cmd_getval(cmdmap, "pool", poolstr);
13115 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13116 if (pool_id < 0) {
13117 ss << "unrecognized pool '" << poolstr << "'";
13118 err = -ENOENT;
13119 goto reply;
13120 }
13121 string tierpoolstr;
13122 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13123 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13124 if (tierpool_id < 0) {
13125 ss << "unrecognized pool '" << tierpoolstr << "'";
13126 err = -ENOENT;
13127 goto reply;
13128 }
13129 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13130 ceph_assert(p);
13131 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13132 ceph_assert(tp);
13133
13134 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13135 goto reply;
13136 }
13137
13138 // make sure new tier is empty
13139 bool force_nonempty = false;
13140 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13141 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13142 if (pstats && pstats->stats.sum.num_objects != 0 &&
13143 !force_nonempty) {
13144 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13145 err = -ENOTEMPTY;
13146 goto reply;
13147 }
13148 if (tp->is_erasure()) {
13149 ss << "tier pool '" << tierpoolstr
13150 << "' is an ec pool, which cannot be a tier";
13151 err = -ENOTSUP;
13152 goto reply;
13153 }
13154 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13155 (!force_nonempty ||
13156 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13157 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13158 err = -ENOTEMPTY;
13159 goto reply;
13160 }
13161 // go
13162 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13163 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13164 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13165 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13166 return true;
13167 }
13168 np->tiers.insert(tierpool_id);
13169 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13170 ntp->tier_of = pool_id;
13171 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13172 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13173 get_last_committed() + 1));
13174 return true;
13175 } else if (prefix == "osd tier remove" ||
13176 prefix == "osd tier rm") {
13177 string poolstr;
13178 cmd_getval(cmdmap, "pool", poolstr);
13179 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13180 if (pool_id < 0) {
13181 ss << "unrecognized pool '" << poolstr << "'";
13182 err = -ENOENT;
13183 goto reply;
13184 }
13185 string tierpoolstr;
13186 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13187 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13188 if (tierpool_id < 0) {
13189 ss << "unrecognized pool '" << tierpoolstr << "'";
13190 err = -ENOENT;
13191 goto reply;
13192 }
13193 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13194 ceph_assert(p);
13195 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13196 ceph_assert(tp);
13197
13198 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13199 goto reply;
13200 }
13201
13202 if (p->tiers.count(tierpool_id) == 0) {
13203 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13204 err = 0;
13205 goto reply;
13206 }
13207 if (tp->tier_of != pool_id) {
13208 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13209 << osdmap.get_pool_name(tp->tier_of) << "': "
13210 // be scary about it; this is an inconsistency and bells must go off
13211 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13212 err = -EINVAL;
13213 goto reply;
13214 }
13215 if (p->read_tier == tierpool_id) {
13216 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13217 err = -EBUSY;
13218 goto reply;
13219 }
13220 // go
13221 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13222 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13223 if (np->tiers.count(tierpool_id) == 0 ||
13224 ntp->tier_of != pool_id ||
13225 np->read_tier == tierpool_id) {
13226 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13227 return true;
13228 }
13229 np->tiers.erase(tierpool_id);
13230 ntp->clear_tier();
13231 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13232 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13233 get_last_committed() + 1));
13234 return true;
13235 } else if (prefix == "osd tier set-overlay") {
13236 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13237 if (err == -EAGAIN)
13238 goto wait;
13239 if (err)
13240 goto reply;
13241 string poolstr;
13242 cmd_getval(cmdmap, "pool", poolstr);
13243 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13244 if (pool_id < 0) {
13245 ss << "unrecognized pool '" << poolstr << "'";
13246 err = -ENOENT;
13247 goto reply;
13248 }
13249 string overlaypoolstr;
13250 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13251 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13252 if (overlaypool_id < 0) {
13253 ss << "unrecognized pool '" << overlaypoolstr << "'";
13254 err = -ENOENT;
13255 goto reply;
13256 }
13257 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13258 ceph_assert(p);
13259 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13260 ceph_assert(overlay_p);
13261 if (p->tiers.count(overlaypool_id) == 0) {
13262 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13263 err = -EINVAL;
13264 goto reply;
13265 }
13266 if (p->read_tier == overlaypool_id) {
13267 err = 0;
13268 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13269 goto reply;
13270 }
13271 if (p->has_read_tier()) {
13272 ss << "pool '" << poolstr << "' has overlay '"
13273 << osdmap.get_pool_name(p->read_tier)
13274 << "'; please remove-overlay first";
13275 err = -EINVAL;
13276 goto reply;
13277 }
13278
13279 // go
13280 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13281 np->read_tier = overlaypool_id;
13282 np->write_tier = overlaypool_id;
13283 np->set_last_force_op_resend(pending_inc.epoch);
13284 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13285 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13286 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13287 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13288 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13289 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13290 get_last_committed() + 1));
13291 return true;
13292 } else if (prefix == "osd tier remove-overlay" ||
13293 prefix == "osd tier rm-overlay") {
13294 string poolstr;
13295 cmd_getval(cmdmap, "pool", poolstr);
13296 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13297 if (pool_id < 0) {
13298 ss << "unrecognized pool '" << poolstr << "'";
13299 err = -ENOENT;
13300 goto reply;
13301 }
13302 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13303 ceph_assert(p);
13304 if (!p->has_read_tier()) {
13305 err = 0;
13306 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13307 goto reply;
13308 }
13309
13310 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13311 goto reply;
13312 }
13313
13314 // go
13315 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13316 if (np->has_read_tier()) {
13317 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13318 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13319 nop->set_last_force_op_resend(pending_inc.epoch);
13320 }
13321 if (np->has_write_tier()) {
13322 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13323 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13324 nop->set_last_force_op_resend(pending_inc.epoch);
13325 }
13326 np->clear_read_tier();
13327 np->clear_write_tier();
13328 np->set_last_force_op_resend(pending_inc.epoch);
13329 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13330 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13331 get_last_committed() + 1));
13332 return true;
13333 } else if (prefix == "osd tier cache-mode") {
13334 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13335 if (err == -EAGAIN)
13336 goto wait;
13337 if (err)
13338 goto reply;
13339 string poolstr;
13340 cmd_getval(cmdmap, "pool", poolstr);
13341 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13342 if (pool_id < 0) {
13343 ss << "unrecognized pool '" << poolstr << "'";
13344 err = -ENOENT;
13345 goto reply;
13346 }
13347 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13348 ceph_assert(p);
13349 if (!p->is_tier()) {
13350 ss << "pool '" << poolstr << "' is not a tier";
13351 err = -EINVAL;
13352 goto reply;
13353 }
13354 string modestr;
13355 cmd_getval(cmdmap, "mode", modestr);
13356 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13357 if (int(mode) < 0) {
13358 ss << "'" << modestr << "' is not a valid cache mode";
13359 err = -EINVAL;
13360 goto reply;
13361 }
13362
13363 bool sure = false;
13364 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13365
13366 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13367 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13368 ss << "'" << modestr << "' is no longer a supported cache mode";
13369 err = -EPERM;
13370 goto reply;
13371 }
13372 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13373 mode != pg_pool_t::CACHEMODE_NONE &&
13374 mode != pg_pool_t::CACHEMODE_PROXY &&
13375 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13376 !sure) {
13377 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13378 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13379 err = -EPERM;
13380 goto reply;
13381 }
13382
13383 // pool already has this cache-mode set and there are no pending changes
13384 if (p->cache_mode == mode &&
13385 (pending_inc.new_pools.count(pool_id) == 0 ||
13386 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13387 ss << "set cache-mode for pool '" << poolstr << "'"
13388 << " to " << pg_pool_t::get_cache_mode_name(mode);
13389 err = 0;
13390 goto reply;
13391 }
13392
13393 /* Mode description:
13394 *
13395 * none: No cache-mode defined
13396 * forward: Forward all reads and writes to base pool [removed]
13397 * writeback: Cache writes, promote reads from base pool
13398 * readonly: Forward writes to base pool
13399 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13400 * proxy: Proxy all reads and writes to base pool
13401 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13402 *
13403 * Hence, these are the allowed transitions:
13404 *
13405 * none -> any
13406 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13407 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13408 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13409 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13410 * writeback -> readproxy || proxy
13411 * readonly -> any
13412 */
13413
13414 // We check if the transition is valid against the current pool mode, as
13415 // it is the only committed state thus far. We will blatantly squash
13416 // whatever mode is on the pending state.
13417
13418 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13419 (mode != pg_pool_t::CACHEMODE_PROXY &&
13420 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13421 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13422 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13423 << "' pool; only '"
13424 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13425 << "' allowed.";
13426 err = -EINVAL;
13427 goto reply;
13428 }
13429 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13430 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13431 mode != pg_pool_t::CACHEMODE_PROXY &&
13432 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13433
13434 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13435 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13436 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13437
13438 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13439 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13440 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13441
13442 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13443 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13444 mode != pg_pool_t::CACHEMODE_PROXY &&
13445 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13446
13447 const pool_stat_t* pstats =
13448 mon.mgrstatmon()->get_pool_stat(pool_id);
13449
13450 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13451 ss << "unable to set cache-mode '"
13452 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13453 << "': dirty objects found";
13454 err = -EBUSY;
13455 goto reply;
13456 }
13457 }
13458 // go
13459 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13460 np->cache_mode = mode;
13461 // set this both when moving to and from cache_mode NONE. this is to
13462 // capture legacy pools that were set up before this flag existed.
13463 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13464 ss << "set cache-mode for pool '" << poolstr
13465 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13466 if (mode == pg_pool_t::CACHEMODE_NONE) {
13467 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13468 ceph_assert(base_pool);
13469 if (base_pool->read_tier == pool_id ||
13470 base_pool->write_tier == pool_id)
13471 ss <<" (WARNING: pool is still configured as read or write tier)";
13472 }
13473 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13474 get_last_committed() + 1));
13475 return true;
13476 } else if (prefix == "osd tier add-cache") {
13477 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13478 if (err == -EAGAIN)
13479 goto wait;
13480 if (err)
13481 goto reply;
13482 string poolstr;
13483 cmd_getval(cmdmap, "pool", poolstr);
13484 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13485 if (pool_id < 0) {
13486 ss << "unrecognized pool '" << poolstr << "'";
13487 err = -ENOENT;
13488 goto reply;
13489 }
13490 string tierpoolstr;
13491 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13492 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13493 if (tierpool_id < 0) {
13494 ss << "unrecognized pool '" << tierpoolstr << "'";
13495 err = -ENOENT;
13496 goto reply;
13497 }
13498 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13499 ceph_assert(p);
13500 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13501 ceph_assert(tp);
13502
13503 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13504 goto reply;
13505 }
13506
13507 int64_t size = 0;
13508 if (!cmd_getval(cmdmap, "size", size)) {
13509 ss << "unable to parse 'size' value '"
13510 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13511 err = -EINVAL;
13512 goto reply;
13513 }
13514 // make sure new tier is empty
13515 const pool_stat_t *pstats =
13516 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13517 if (pstats && pstats->stats.sum.num_objects != 0) {
13518 ss << "tier pool '" << tierpoolstr << "' is not empty";
13519 err = -ENOTEMPTY;
13520 goto reply;
13521 }
13522 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13523 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13524 if (int(mode) < 0) {
13525 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13526 err = -EINVAL;
13527 goto reply;
13528 }
13529 HitSet::Params hsp;
13530 auto& cache_hit_set_type =
13531 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13532 if (cache_hit_set_type == "bloom") {
13533 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13534 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13535 hsp = HitSet::Params(bsp);
13536 } else if (cache_hit_set_type == "explicit_hash") {
13537 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13538 } else if (cache_hit_set_type == "explicit_object") {
13539 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13540 } else {
13541 ss << "osd tier cache default hit set type '"
13542 << cache_hit_set_type << "' is not a known type";
13543 err = -EINVAL;
13544 goto reply;
13545 }
13546 // go
13547 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13548 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
13549 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13550 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13551 return true;
13552 }
13553 np->tiers.insert(tierpool_id);
13554 np->read_tier = np->write_tier = tierpool_id;
13555 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13556 np->set_last_force_op_resend(pending_inc.epoch);
13557 ntp->set_last_force_op_resend(pending_inc.epoch);
13558 ntp->tier_of = pool_id;
13559 ntp->cache_mode = mode;
13560 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13561 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13562 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13563 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13564 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13565 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13566 ntp->hit_set_params = hsp;
13567 ntp->target_max_bytes = size;
13568 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13569 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13570 get_last_committed() + 1));
13571 return true;
13572 } else if (prefix == "osd pool set-quota") {
13573 string poolstr;
13574 cmd_getval(cmdmap, "pool", poolstr);
13575 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13576 if (pool_id < 0) {
13577 ss << "unrecognized pool '" << poolstr << "'";
13578 err = -ENOENT;
13579 goto reply;
13580 }
13581
13582 string field;
13583 cmd_getval(cmdmap, "field", field);
13584 if (field != "max_objects" && field != "max_bytes") {
13585 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13586 err = -EINVAL;
13587 goto reply;
13588 }
13589
13590 // val could contain unit designations, so we treat as a string
13591 string val;
13592 cmd_getval(cmdmap, "val", val);
13593 string tss;
13594 int64_t value;
13595 if (field == "max_objects") {
13596 value = strict_si_cast<uint64_t>(val, &tss);
13597 } else if (field == "max_bytes") {
13598 value = strict_iecstrtoll(val, &tss);
13599 } else {
13600 ceph_abort_msg("unrecognized option");
13601 }
13602 if (!tss.empty()) {
13603 ss << "error parsing value '" << val << "': " << tss;
13604 err = -EINVAL;
13605 goto reply;
13606 }
13607
13608 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13609 if (field == "max_objects") {
13610 pi->quota_max_objects = value;
13611 } else if (field == "max_bytes") {
13612 pi->quota_max_bytes = value;
13613 } else {
13614 ceph_abort_msg("unrecognized option");
13615 }
13616 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13617 rs = ss.str();
13618 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13619 get_last_committed() + 1));
13620 return true;
13621 } else if (prefix == "osd pool application enable" ||
13622 prefix == "osd pool application disable" ||
13623 prefix == "osd pool application set" ||
13624 prefix == "osd pool application rm") {
13625 err = prepare_command_pool_application(prefix, cmdmap, ss);
13626 if (err == -EAGAIN) {
13627 goto wait;
13628 } else if (err < 0) {
13629 goto reply;
13630 } else {
13631 goto update;
13632 }
13633 } else if (prefix == "osd force-create-pg") {
13634 pg_t pgid;
13635 string pgidstr;
13636 cmd_getval(cmdmap, "pgid", pgidstr);
13637 if (!pgid.parse(pgidstr.c_str())) {
13638 ss << "invalid pgid '" << pgidstr << "'";
13639 err = -EINVAL;
13640 goto reply;
13641 }
13642 if (!osdmap.pg_exists(pgid)) {
13643 ss << "pg " << pgid << " should not exist";
13644 err = -ENOENT;
13645 goto reply;
13646 }
13647 bool sure = false;
13648 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13649 if (!sure) {
13650 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13651 << "that the cluster will give up ever trying to recover the lost data. Do this "
13652 << "only if you are certain that all copies of the PG are in fact lost and you are "
13653 << "willing to accept that the data is permanently destroyed. Pass "
13654 << "--yes-i-really-mean-it to proceed.";
13655 err = -EPERM;
13656 goto reply;
13657 }
13658 bool creating_now;
13659 {
13660 std::lock_guard<std::mutex> l(creating_pgs_lock);
13661 auto emplaced = creating_pgs.pgs.emplace(
13662 pgid,
13663 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13664 ceph_clock_now()));
13665 creating_now = emplaced.second;
13666 }
13667 if (creating_now) {
13668 ss << "pg " << pgidstr << " now creating, ok";
13669 // set the pool's CREATING flag so that (1) the osd won't ignore our
13670 // create message and (2) we won't propose any future pg_num changes
13671 // until after the PG has been instantiated.
13672 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13673 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13674 }
13675 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13676 err = 0;
13677 goto update;
13678 } else {
13679 ss << "pg " << pgid << " already creating";
13680 err = 0;
13681 goto reply;
13682 }
13683 } else if (prefix == "osd force_healthy_stretch_mode") {
13684 bool sure = false;
13685 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13686 if (!sure) {
13687 ss << "This command will require peering across multiple CRUSH buckets "
13688 "(probably two data centers or availability zones?) and may result in PGs "
13689 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13690 err = -EPERM;
13691 goto reply;
13692 }
13693 try_end_recovery_stretch_mode(true);
13694 ss << "Triggering healthy stretch mode";
13695 err = 0;
13696 goto reply;
13697 } else if (prefix == "osd force_recovery_stretch_mode") {
13698 bool sure = false;
13699 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13700 if (!sure) {
13701 ss << "This command will increase pool sizes to try and spread them "
13702 "across multiple CRUSH buckets (probably two data centers or "
13703 "availability zones?) and should have happened automatically"
13704 "Pass --yes-i-really-mean-it to proceed.";
13705 err = -EPERM;
13706 goto reply;
13707 }
13708 mon.go_recovery_stretch_mode();
13709 ss << "Triggering recovery stretch mode";
13710 err = 0;
13711 goto reply;
13712 } else {
13713 err = -EINVAL;
13714 }
13715
13716 reply:
13717 getline(ss, rs);
13718 if (err < 0 && rs.length() == 0)
13719 rs = cpp_strerror(err);
13720 mon.reply_command(op, err, rs, rdata, get_last_committed());
13721 return ret;
13722
13723 update:
13724 getline(ss, rs);
13725 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13726 get_last_committed() + 1));
13727 return true;
13728
13729 wait:
13730 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13731 return true;
13732 }
13733
13734 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13735 {
13736 op->mark_osdmon_event(__func__);
13737
13738 auto m = op->get_req<MPoolOp>();
13739 MonSession *session = op->get_session();
13740 if (!session) {
13741 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13742 return true;
13743 }
13744
13745 switch (m->op) {
13746 case POOL_OP_CREATE_UNMANAGED_SNAP:
13747 case POOL_OP_DELETE_UNMANAGED_SNAP:
13748 {
13749 const std::string* pool_name = nullptr;
13750 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13751 if (pg_pool != nullptr) {
13752 pool_name = &osdmap.get_pool_name(m->pool);
13753 }
13754
13755 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
13756 session->entity_name, session->caps,
13757 session->get_peer_socket_addr(),
13758 pool_name)) {
13759 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13760 << "privileges. message: " << *m << std::endl
13761 << "caps: " << session->caps << dendl;
13762 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13763 return true;
13764 }
13765 }
13766 break;
13767 default:
13768 if (!session->is_capable("osd", MON_CAP_W)) {
13769 dout(0) << "got pool op from entity with insufficient privileges. "
13770 << "message: " << *m << std::endl
13771 << "caps: " << session->caps << dendl;
13772 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13773 return true;
13774 }
13775 break;
13776 }
13777
13778 return false;
13779 }
13780
/**
 * Read-only fast path for MPoolOp messages.
 *
 * Answers ops that need no map change (idempotent repeats, invalid
 * combinations) directly from the committed osdmap.  Returns true when a
 * reply was sent here and the op is finished; false to hand the op on to
 * prepare_pool_op() for a map update.
 */
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // a true return means the caps check already sent a -EPERM reply
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop cross-cluster traffic
  if (m->fsid != mon.monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  // pool creation has its own idempotency check
  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    if (m->op == POOL_OP_DELETE) {
      // deleting a pool that is already gone succeeds (idempotent)
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    // pool snaps and unmanaged (self-managed) snaps are mutually
    // exclusive; cache tiers cannot take pool snaps at all
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already there -- idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone -- idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // already removed/purged -- idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this *name*
    // still exists, which reads inverted relative to the other
    // idempotency checks above -- confirm against MPoolOp delete
    // semantics before relying on it.
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // always forwarded to prepare, which rejects it with -EOPNOTSUPP
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13868
13869 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13870 {
13871 if (!osdmap.have_pg_pool(pool)) {
13872 dout(10) << __func__ << " pool " << pool << " snap " << snap
13873 << " - pool dne" << dendl;
13874 return true;
13875 }
13876 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13877 dout(10) << __func__ << " pool " << pool << " snap " << snap
13878 << " - in osdmap removed_snaps_queue" << dendl;
13879 return true;
13880 }
13881 snapid_t begin, end;
13882 int r = lookup_purged_snap(pool, snap, &begin, &end);
13883 if (r == 0) {
13884 dout(10) << __func__ << " pool " << pool << " snap " << snap
13885 << " - purged, [" << begin << "," << end << ")" << dendl;
13886 return true;
13887 }
13888 return false;
13889 }
13890
13891 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
13892 {
13893 if (pending_inc.old_pools.count(pool)) {
13894 dout(10) << __func__ << " pool " << pool << " snap " << snap
13895 << " - pool pending deletion" << dendl;
13896 return true;
13897 }
13898 if (pending_inc.in_new_removed_snaps(pool, snap)) {
13899 dout(10) << __func__ << " pool " << pool << " snap " << snap
13900 << " - in pending new_removed_snaps" << dendl;
13901 return true;
13902 }
13903 return false;
13904 }
13905
13906 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
13907 {
13908 op->mark_osdmon_event(__func__);
13909 auto m = op->get_req<MPoolOp>();
13910 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
13911 if (pool >= 0) {
13912 _pool_op_reply(op, 0, osdmap.get_epoch());
13913 return true;
13914 }
13915
13916 return false;
13917 }
13918
/**
 * Apply a pool op to the pending map.
 *
 * Returns true when a map change was (or may be) proposed and the reply is
 * deferred until the proposal commits; false when the op was rejected (or
 * trivially satisfied) and a reply has already been sent.
 */
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  // create and delete have their own dedicated prepare paths
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  // pool may have vanished between preprocess and prepare (map churn)
  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // first, re-run the idempotency/validity checks against the *committed*
  // pool state (preprocess may have run on an older map)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // create of an existing snap / delete of a missing snap is a no-op
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
	ret = 0;
      } else {
	break;
      }
    } else {
      // pool-snap ops are invalid on a pool in unmanaged snaps mode
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info: start from the pending copy if one exists, so we
  // layer this change on top of any not-yet-committed edits
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive; re-check against
  // the *projected* state, which may differ from the committed one
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // now apply the actual change to the projected pool
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      // snap_exists() returns the snapid (0 if absent)
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // pre-octopus maps track removed snaps differently; tell the pool
      // which encoding regime it is under
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      // the new snapid is returned to the client in the reply payload
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; reject outright
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    // bump the snap epoch and stage the projected pool into the pending map
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply (with any payload, e.g. a new snapid) once the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14073
14074 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14075 {
14076 op->mark_osdmon_event(__func__);
14077 int err = prepare_new_pool(op);
14078 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14079 return true;
14080 }
14081
14082 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14083 ostream *ss)
14084 {
14085 const string& poolstr = osdmap.get_pool_name(pool_id);
14086
14087 // If the Pool is in use by CephFS, refuse to delete it
14088 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14089 if (pending_fsmap.pool_in_use(pool_id)) {
14090 *ss << "pool '" << poolstr << "' is in use by CephFS";
14091 return -EBUSY;
14092 }
14093
14094 if (pool.tier_of >= 0) {
14095 *ss << "pool '" << poolstr << "' is a tier of '"
14096 << osdmap.get_pool_name(pool.tier_of) << "'";
14097 return -EBUSY;
14098 }
14099 if (!pool.tiers.empty()) {
14100 *ss << "pool '" << poolstr << "' has tiers";
14101 for(auto tier : pool.tiers) {
14102 *ss << " " << osdmap.get_pool_name(tier);
14103 }
14104 return -EBUSY;
14105 }
14106
14107 if (!g_conf()->mon_allow_pool_delete) {
14108 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14109 return -EPERM;
14110 }
14111
14112 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14113 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14114 return -EPERM;
14115 }
14116
14117 *ss << "pool '" << poolstr << "' removed";
14118 return 0;
14119 }
14120
14121 /**
14122 * Check if it is safe to add a tier to a base pool
14123 *
14124 * @return
14125 * True if the operation should proceed, false if we should abort here
14126 * (abort doesn't necessarily mean error, could be idempotency)
14127 */
14128 bool OSDMonitor::_check_become_tier(
14129 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14130 const int64_t base_pool_id, const pg_pool_t *base_pool,
14131 int *err,
14132 ostream *ss) const
14133 {
14134 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14135 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14136
14137 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14138 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14139 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14140 *err = -EBUSY;
14141 return false;
14142 }
14143
14144 if (base_pool->tiers.count(tier_pool_id)) {
14145 ceph_assert(tier_pool->tier_of == base_pool_id);
14146 *err = 0;
14147 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14148 << base_pool_name << "'";
14149 return false;
14150 }
14151
14152 if (base_pool->is_tier()) {
14153 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14154 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14155 << "multiple tiers are not yet supported.";
14156 *err = -EINVAL;
14157 return false;
14158 }
14159
14160 if (tier_pool->has_tiers()) {
14161 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14162 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14163 it != tier_pool->tiers.end(); ++it)
14164 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14165 *ss << " multiple tiers are not yet supported.";
14166 *err = -EINVAL;
14167 return false;
14168 }
14169
14170 if (tier_pool->is_tier()) {
14171 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14172 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14173 *err = -EINVAL;
14174 return false;
14175 }
14176
14177 *err = 0;
14178 return true;
14179 }
14180
14181
14182 /**
14183 * Check if it is safe to remove a tier from this base pool
14184 *
14185 * @return
14186 * True if the operation should proceed, false if we should abort here
14187 * (abort doesn't necessarily mean error, could be idempotency)
14188 */
14189 bool OSDMonitor::_check_remove_tier(
14190 const int64_t base_pool_id, const pg_pool_t *base_pool,
14191 const pg_pool_t *tier_pool,
14192 int *err, ostream *ss) const
14193 {
14194 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14195
14196 // Apply CephFS-specific checks
14197 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14198 if (pending_fsmap.pool_in_use(base_pool_id)) {
14199 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14200 // If the underlying pool is erasure coded and does not allow EC
14201 // overwrites, we can't permit the removal of the replicated tier that
14202 // CephFS relies on to access it
14203 *ss << "pool '" << base_pool_name <<
14204 "' does not allow EC overwrites and is in use by CephFS"
14205 " via its tier";
14206 *err = -EBUSY;
14207 return false;
14208 }
14209
14210 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14211 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14212 "tier is still in use as a writeback cache. Change the cache "
14213 "mode and flush the cache before removing it";
14214 *err = -EBUSY;
14215 return false;
14216 }
14217 }
14218
14219 *err = 0;
14220 return true;
14221 }
14222
/**
 * Stage removal of a pool into the pending incremental map.
 *
 * Also scrubs every derived per-PG mapping (pg_temp, primary_temp,
 * pg_upmap, pg_upmap_items and their pending counterparts) and any CRUSH
 * choose_args that reference the pool.  With mon_fake_pool_delete set (and
 * no_fake false) the pool is merely renamed to *.DELETED instead.
 *
 * Returns 0 on success, -EAGAIN if the pending state must settle first,
 * or a negative errno from _check_remove_pool().
 */
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // validate against the committed pool state first
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal already staged in this proposal
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // fake deletion: rename instead of removing, to allow recovery
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
	    << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      // an empty vector in new_pg_temp clears the mapping
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
       p != osdmap.primary_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
	       << " removing obsolete primary_temp" << p->first << dendl;
      // -1 clears the primary_temp mapping
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush = _get_pending_crush();
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    // re-encode the whole crush map into the pending incremental
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14337
14338 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14339 {
14340 dout(10) << "_prepare_rename_pool " << pool << dendl;
14341 if (pending_inc.old_pools.count(pool)) {
14342 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14343 return -ENOENT;
14344 }
14345 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14346 p != pending_inc.new_pool_names.end();
14347 ++p) {
14348 if (p->second == newname && p->first != pool) {
14349 return -EEXIST;
14350 }
14351 }
14352
14353 pending_inc.new_pool_names[pool] = newname;
14354 return 0;
14355 }
14356
14357 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14358 {
14359 op->mark_osdmon_event(__func__);
14360 auto m = op->get_req<MPoolOp>();
14361 ostringstream ss;
14362 int ret = _prepare_remove_pool(m->pool, &ss, false);
14363 if (ret == -EAGAIN) {
14364 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14365 return true;
14366 }
14367 if (ret < 0)
14368 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14369 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14370 pending_inc.epoch));
14371 return true;
14372 }
14373
14374 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14375 int ret, epoch_t epoch, bufferlist *blp)
14376 {
14377 op->mark_osdmon_event(__func__);
14378 auto m = op->get_req<MPoolOp>();
14379 dout(20) << "_pool_op_reply " << ret << dendl;
14380 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14381 ret, epoch, get_last_committed(), blp);
14382 mon.send_reply(op, reply);
14383 }
14384
14385 void OSDMonitor::convert_pool_priorities(void)
14386 {
14387 pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
14388 int64_t max_prio = 0;
14389 int64_t min_prio = 0;
14390 for (const auto &i : osdmap.get_pools()) {
14391 const auto &pool = i.second;
14392
14393 if (pool.opts.is_set(key)) {
14394 int64_t prio = 0;
14395 pool.opts.get(key, &prio);
14396 if (prio > max_prio)
14397 max_prio = prio;
14398 if (prio < min_prio)
14399 min_prio = prio;
14400 }
14401 }
14402 if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
14403 dout(20) << __func__ << " nothing to fix" << dendl;
14404 return;
14405 }
14406 // Current pool priorities exceeds new maximum
14407 for (const auto &i : osdmap.get_pools()) {
14408 const auto pool_id = i.first;
14409 pg_pool_t pool = i.second;
14410
14411 int64_t prio = 0;
14412 pool.opts.get(key, &prio);
14413 int64_t n;
14414
14415 if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
14416 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14417 n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
14418 } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
14419 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14420 n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
14421 } else {
14422 continue;
14423 }
14424 if (n == 0) {
14425 pool.opts.unset(key);
14426 } else {
14427 pool.opts.set(key, static_cast<int64_t>(n));
14428 }
14429 dout(10) << __func__ << " pool " << pool_id
14430 << " recovery_priority adjusted "
14431 << prio << " to " << n << dendl;
14432 pool.last_change = pending_inc.epoch;
14433 pending_inc.new_pools[pool_id] = pool;
14434 }
14435 }
14436
14437 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14438 int *errcode,
14439 set<pg_pool_t*>* pools,
14440 const string& new_crush_rule)
14441 {
14442 dout(20) << __func__ << dendl;
14443 *okay = false;
14444 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14445 if (new_crush_rule_result < 0) {
14446 ss << "unrecognized crush rule " << new_crush_rule_result;
14447 *errcode = new_crush_rule_result;
14448 return;
14449 }
14450 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14451 for (const auto& pooli : osdmap.pools) {
14452 int64_t poolid = pooli.first;
14453 const pg_pool_t *p = &pooli.second;
14454 if (!p->is_replicated()) {
14455 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14456 *errcode = -EINVAL;
14457 return;
14458 }
14459 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14460 if ((p->get_size() != default_size ||
14461 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14462 (p->get_crush_rule() != new_rule)) {
14463 ss << "we currently require stretch mode pools start out with the"
14464 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14465 *errcode = -EINVAL;
14466 return;
14467 }
14468 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14469 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14470 // the attempt may fail and then we have these pool updates...but they won't do anything
14471 // if there is a failure, so if it's hard to change the interface, no need to bother
14472 pools->insert(pp);
14473 }
14474 *okay = true;
14475 return;
14476 }
14477
14478 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14479 int *errcode, bool commit,
14480 const string& dividing_bucket,
14481 uint32_t bucket_count,
14482 const set<pg_pool_t*>& pools,
14483 const string& new_crush_rule)
14484 {
14485 dout(20) << __func__ << dendl;
14486 *okay = false;
14487 CrushWrapper crush = _get_pending_crush();
14488 int dividing_id = -1;
14489 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14490 !type_id.has_value()) {
14491 ss << dividing_bucket << " is not a valid crush bucket type";
14492 *errcode = -ENOENT;
14493 ceph_assert(!commit);
14494 return;
14495 } else {
14496 dividing_id = *type_id;
14497 }
14498 vector<int> subtrees;
14499 crush.get_subtree_of_type(dividing_id, &subtrees);
14500 if (subtrees.size() != 2) {
14501 ss << "there are " << subtrees.size() << dividing_bucket
14502 << "'s in the cluster but stretch mode currently only works with 2!";
14503 *errcode = -EINVAL;
14504 ceph_assert(!commit || subtrees.size() == 2);
14505 return;
14506 }
14507
14508 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14509 if (new_crush_rule_result < 0) {
14510 ss << "unrecognized crush rule " << new_crush_rule;
14511 *errcode = new_crush_rule_result;
14512 ceph_assert(!commit || (new_crush_rule_result > 0));
14513 return;
14514 }
14515 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14516
14517 int weight1 = crush.get_item_weight(subtrees[0]);
14518 int weight2 = crush.get_item_weight(subtrees[1]);
14519 if (weight1 != weight2) {
14520 // TODO: I'm really not sure this is a good idea?
14521 ss << "the 2 " << dividing_bucket
14522 << "instances in the cluster have differing weights "
14523 << weight1 << " and " << weight2
14524 <<" but stretch mode currently requires they be the same!";
14525 *errcode = -EINVAL;
14526 ceph_assert(!commit || (weight1 == weight2));
14527 return;
14528 }
14529 if (bucket_count != 2) {
14530 ss << "currently we only support 2-site stretch clusters!";
14531 *errcode = -EINVAL;
14532 ceph_assert(!commit || bucket_count == 2);
14533 return;
14534 }
14535 // TODO: check CRUSH rules for pools so that we are appropriately divided
14536 if (commit) {
14537 for (auto pool : pools) {
14538 pool->crush_rule = new_rule;
14539 pool->peering_crush_bucket_count = bucket_count;
14540 pool->peering_crush_bucket_target = bucket_count;
14541 pool->peering_crush_bucket_barrier = dividing_id;
14542 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14543 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14544 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14545 }
14546 pending_inc.change_stretch_mode = true;
14547 pending_inc.stretch_mode_enabled = true;
14548 pending_inc.new_stretch_bucket_count = bucket_count;
14549 pending_inc.new_degraded_stretch_mode = 0;
14550 pending_inc.new_stretch_mode_bucket = dividing_id;
14551 }
14552 *okay = true;
14553 return;
14554 }
14555
14556 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14557 set<int> *really_down_buckets,
14558 set<string> *really_down_mons)
14559 {
14560 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14561 ceph_assert(is_readable());
14562 if (dead_buckets.empty()) return false;
14563 set<int> down_cache;
14564 bool really_down = false;
14565 for (auto dbi : dead_buckets) {
14566 const string& bucket_name = dbi.first;
14567 ceph_assert(osdmap.crush->name_exists(bucket_name));
14568 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14569 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14570 << " to see if OSDs are also down" << dendl;
14571 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14572 if (subtree_down) {
14573 dout(20) << "subtree is down!" << dendl;
14574 really_down = true;
14575 really_down_buckets->insert(bucket_id);
14576 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14577 }
14578 }
14579 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14580 << " and mons " << *really_down_mons << " are really down" << dendl;
14581 return really_down;
14582 }
14583
14584 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14585 const set<string>& live_zones)
14586 {
14587 dout(20) << __func__ << dendl;
14588 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14589 // update the general OSDMap changes
14590 pending_inc.change_stretch_mode = true;
14591 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14592 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14593 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14594 ceph_assert(new_site_count == 1); // stretch count 2!
14595 pending_inc.new_degraded_stretch_mode = new_site_count;
14596 pending_inc.new_recovering_stretch_mode = 0;
14597 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14598
14599 // and then apply them to all the pg_pool_ts
14600 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14601 const string& remaining_site_name = *(live_zones.begin());
14602 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14603 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14604 for (auto pgi : osdmap.pools) {
14605 if (pgi.second.peering_crush_bucket_count) {
14606 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14607 newp.peering_crush_bucket_count = new_site_count;
14608 newp.peering_crush_mandatory_member = remaining_site;
14609 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14610 newp.last_force_op_resend = pending_inc.epoch;
14611 }
14612 }
14613 propose_pending();
14614 }
14615
14616 void OSDMonitor::trigger_recovery_stretch_mode()
14617 {
14618 dout(20) << __func__ << dendl;
14619 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14620 pending_inc.change_stretch_mode = true;
14621 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14622 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14623 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14624 pending_inc.new_recovering_stretch_mode = 1;
14625 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14626
14627 for (auto pgi : osdmap.pools) {
14628 if (pgi.second.peering_crush_bucket_count) {
14629 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14630 newp.last_force_op_resend = pending_inc.epoch;
14631 }
14632 }
14633 propose_pending();
14634 }
14635
// Record that we are now in degraded stretch mode by clearing the recovery
// timestamp -- time spent degraded must not count toward exiting recovery.
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14640
14641 void OSDMonitor::set_recovery_stretch_mode()
14642 {
14643 if (stretch_recovery_triggered.is_zero()) {
14644 stretch_recovery_triggered = ceph_clock_now();
14645 }
14646 }
14647
// Record that we are fully healthy again: clear the recovery timestamp so
// notify_new_pg_digest() stops trying to end recovery stretch mode.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14652
14653 void OSDMonitor::notify_new_pg_digest()
14654 {
14655 dout(20) << __func__ << dendl;
14656 if (!stretch_recovery_triggered.is_zero()) {
14657 try_end_recovery_stretch_mode(false);
14658 }
14659 }
14660
14661 struct CMonExitRecovery : public Context {
14662 OSDMonitor *m;
14663 bool force;
14664 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14665 void finish(int r) {
14666 m->try_end_recovery_stretch_mode(force);
14667 }
14668 };
14669
// Attempt to leave recovering stretch mode and return to healthy stretch
// mode.  Invoked from notify_new_pg_digest() on new PG stats and re-queued
// via CMonExitRecovery whenever required state is not yet readable.
//
// force: skip both the minimum-wait and the PG-health checks and
// transition immediately.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  // only the leader proposes map changes, and only the
  // degraded+recovering combination can transition to healthy
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // our own state isn't readable; retry (with the same force flag)
    // once it is
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // proceed only if the OSDMap agrees we are recovering AND either the
  // minimum wait (mon_stretch_recovery_min_wait) has elapsed since the
  // recovery timestamp was stamped, or the caller is forcing it
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // PG digest not readable yet; retry when it becomes so
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced data is tolerated; degraded/inactive/unknown PGs block us
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
14699
14700 void OSDMonitor::trigger_healthy_stretch_mode()
14701 {
14702 ceph_assert(is_writeable());
14703 stretch_recovery_triggered.set_from_double(0);
14704 pending_inc.change_stretch_mode = true;
14705 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14706 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14707 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
14708 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
14709 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14710 for (auto pgi : osdmap.pools) {
14711 if (pgi.second.peering_crush_bucket_count) {
14712 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14713 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
14714 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14715 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14716 newp.last_force_op_resend = pending_inc.epoch;
14717 }
14718 }
14719 propose_pending();
14720 }