]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/OSDMonitor.cc
5d76862e0a8346a593879f0f69b55870a04fc912
[ceph.git] / ceph / src / mon / OSDMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 * Copyright (C) 2014 Red Hat <contact@redhat.com>
9 *
10 * Author: Loic Dachary <loic@dachary.org>
11 *
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
16 *
17 */
18
19 #include <algorithm>
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
22 #include <locale>
23 #include <sstream>
24
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
31
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
34
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
38
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
61
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
69
70 #include "common/config.h"
71 #include "common/errno.h"
72
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
76
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
86
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
89
90 #include "json_spirit/json_spirit_reader.h"
91
92 #include <boost/algorithm/string/predicate.hpp>
93
94 using std::dec;
95 using std::hex;
96 using std::list;
97 using std::map;
98 using std::make_pair;
99 using std::ostringstream;
100 using std::pair;
101 using std::set;
102 using std::string;
103 using std::stringstream;
104 using std::to_string;
105 using std::vector;
106
107 using ceph::bufferlist;
108 using ceph::decode;
109 using ceph::encode;
110 using ceph::ErasureCodeInterfaceRef;
111 using ceph::ErasureCodePluginRegistry;
112 using ceph::ErasureCodeProfile;
113 using ceph::Formatter;
114 using ceph::JSONFormatter;
115 using ceph::make_message;
116
#define dout_subsys ceph_subsys_mon

// kvstore key prefixes for osdmon-owned data that lives outside the
// paxos service namespace (advertised via get_store_prefixes()).
static const string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const string OSD_METADATA_PREFIX("osd_metadata");
static const string OSD_SNAP_PREFIX("osd_snap");
121
122 /*
123
124 OSD snapshot metadata
125 ---------------------
126
127 -- starting with mimic, removed in octopus --
128
129 "removed_epoch_%llu_%08lx" % (pool, epoch)
130 -> interval_set<snapid_t>
131
132 "removed_snap_%llu_%016llx" % (pool, last_snap)
133 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
134
135
136 -- starting with mimic --
137
138 "purged_snap_%llu_%016llx" % (pool, last_snap)
139 -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)
140
 - note that the {removed,purged}_snap keys put the last snap in the key so
   that we can use forward iteration only to search for an epoch in an
   interval. e.g., to test if epoch N is removed/purged, we'll find a key
   >= N that either does or doesn't contain the given snap.
145
146
147 -- starting with octopus --
148
149 "purged_epoch_%08lx" % epoch
150 -> map<int64_t,interval_set<snapid_t>>
151
152 */
153 using namespace TOPNSPC::common;
154 namespace {
155
// Adapter exposing an osdmon LRU cache to the PriorityCache manager
// (pcm).  The manager periodically asks each registered cache how many
// bytes it wants per priority and later commits a share of the total
// memory budget back to it; subclasses report the bytes actually used
// by the underlying LRU.
struct OSDMemCache : public PriorityCache::PriCache {
  OSDMonitor *osdmon;
  // bytes requested/assigned per priority level
  int64_t cache_bytes[PriorityCache::Priority::LAST+1] = {0};
  int64_t committed_bytes = 0;  // last size granted by the manager
  double cache_ratio = 0;       // fraction of the budget given to this cache

  OSDMemCache(OSDMonitor *m) : osdmon(m) {};

  // Bytes currently held by the underlying LRU (subclass-specific).
  virtual uint64_t _get_used_bytes() const = 0;

  // Additional bytes wanted at priority 'pri'; only PRI1 is meaningful
  // since all cached items are given PRI1.
  virtual int64_t request_cache_bytes(
      PriorityCache::Priority pri, uint64_t total_cache) const {
    int64_t assigned = get_cache_bytes(pri);

    switch (pri) {
    // All cache items are currently set to have PRI1 priority
    case PriorityCache::Priority::PRI1:
      {
        // ask only for the shortfall beyond what is already assigned
        int64_t request = _get_used_bytes();
        return (request > assigned) ? request - assigned : 0;
      }
    default:
      break;
    }
    return -EOPNOTSUPP;
  }

  virtual int64_t get_cache_bytes(PriorityCache::Priority pri) const {
    return cache_bytes[pri];
  }

  // Sum of assigned bytes across every priority level.
  virtual int64_t get_cache_bytes() const {
    int64_t total = 0;

    for (int i = 0; i < PriorityCache::Priority::LAST + 1; i++) {
      PriorityCache::Priority pri = static_cast<PriorityCache::Priority>(i);
      total += get_cache_bytes(pri);
    }
    return total;
  }

  virtual void set_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] = bytes;
  }
  virtual void add_cache_bytes(PriorityCache::Priority pri, int64_t bytes) {
    cache_bytes[pri] += bytes;
  }
  // Round the request down/up to a chunk of the total budget and record
  // it as our committed size.
  virtual int64_t commit_cache_size(uint64_t total_cache) {
    committed_bytes = PriorityCache::get_chunk(
      get_cache_bytes(), total_cache);
    return committed_bytes;
  }
  virtual int64_t get_committed_size() const {
    return committed_bytes;
  }
  virtual double get_cache_ratio() const {
    return cache_ratio;
  }
  virtual void set_cache_ratio(double ratio) {
    cache_ratio = ratio;
  }
  // Bin-based aging is not used by the osdmon caches; these are no-ops.
  virtual void shift_bins() {
  }
  virtual void import_bins(const std::vector<uint64_t> &bins) {
  }
  virtual void set_bins(PriorityCache::Priority pri, uint64_t end_bin) {
  }
  virtual uint64_t get_bins(PriorityCache::Priority pri) const {
    return 0;
  }

  // Human-readable name used in pcm logging/perf output.
  virtual string get_cache_name() const = 0;
};
229
230 struct IncCache : public OSDMemCache {
231 IncCache(OSDMonitor *m) : OSDMemCache(m) {};
232
233 virtual uint64_t _get_used_bytes() const {
234 return osdmon->inc_osd_cache.get_bytes();
235 }
236
237 virtual string get_cache_name() const {
238 return "OSDMap Inc Cache";
239 }
240
241 uint64_t _get_num_osdmaps() const {
242 return osdmon->inc_osd_cache.get_size();
243 }
244 };
245
246 struct FullCache : public OSDMemCache {
247 FullCache(OSDMonitor *m) : OSDMemCache(m) {};
248
249 virtual uint64_t _get_used_bytes() const {
250 return osdmon->full_osd_cache.get_bytes();
251 }
252
253 virtual string get_cache_name() const {
254 return "OSDMap Full Cache";
255 }
256
257 uint64_t _get_num_osdmaps() const {
258 return osdmon->full_osd_cache.get_size();
259 }
260 };
261
// Shared with the PriorityCache manager (pcm); created in the
// OSDMonitor constructor.
std::shared_ptr<IncCache> inc_cache;
std::shared_ptr<FullCache> full_cache;

// Hard limits on per-pool application metadata accepted from clients.
const uint32_t MAX_POOL_APPLICATIONS = 4;
const uint32_t MAX_POOL_APPLICATION_KEYS = 64;
const uint32_t MAX_POOL_APPLICATION_LENGTH = 128;
268
269 bool is_osd_writable(const OSDCapGrant& grant, const std::string* pool_name) {
270 // Note: this doesn't include support for the application tag match
271 if ((grant.spec.allow & OSD_CAP_W) != 0) {
272 auto& match = grant.match;
273 if (match.is_match_all()) {
274 return true;
275 } else if (pool_name != nullptr &&
276 !match.pool_namespace.pool_name.empty() &&
277 match.pool_namespace.pool_name == *pool_name) {
278 return true;
279 }
280 }
281 return false;
282 }
283
// Decide whether 'entity_name' may issue unmanaged-snapshot pool ops:
// first via an explicit mon cap for "osd pool op unmanaged-snap", then
// by falling back to the entity's OSD caps stored in the auth db (write
// permission on the pool — or on all pools — is sufficient).
// 'pool_name' may be null when the pool does not exist; an unrestricted
// cap is then required.
bool is_unmanaged_snap_op_permitted(CephContext* cct,
                                    const KeyServer& key_server,
                                    const EntityName& entity_name,
                                    const MonCap& mon_caps,
                                    const entity_addr_t& peer_socket_addr,
                                    const std::string* pool_name)
{
  typedef std::map<std::string, std::string> CommandArgs;

  if (mon_caps.is_capable(
        cct, entity_name, "osd",
        "osd pool op unmanaged-snap",
        (pool_name == nullptr ?
         CommandArgs{} /* pool DNE, require unrestricted cap */ :
         CommandArgs{{"poolname", *pool_name}}),
        false, true, false,
        peer_socket_addr)) {
    return true;
  }

  // No explicit mon cap: inspect the entity's OSD caps from the auth db.
  AuthCapsInfo caps_info;
  if (!key_server.get_service_caps(entity_name, CEPH_ENTITY_TYPE_OSD,
                                   caps_info)) {
    dout(10) << "unable to locate OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  string caps_str;
  if (caps_info.caps.length() > 0) {
    auto p = caps_info.caps.cbegin();
    try {
      decode(caps_str, p);
    } catch (const ceph::buffer::error &err) {
      derr << "corrupt OSD cap data for " << entity_name << " in auth db"
           << dendl;
      return false;
    }
  }

  OSDCap osd_cap;
  if (!osd_cap.parse(caps_str, nullptr)) {
    dout(10) << "unable to parse OSD cap data for " << entity_name
             << " in auth db" << dendl;
    return false;
  }

  // if the entity has write permissions in one or all pools, permit
  // usage of unmanaged-snapshots
  if (osd_cap.allow_all()) {
    return true;
  }

  for (auto& grant : osd_cap.grants) {
    if (grant.profile.is_valid()) {
      // profile caps expand into a list of grants; any writable one suffices
      for (auto& profile_grant : grant.profile_grants) {
        if (is_osd_writable(profile_grant, pool_name)) {
          return true;
        }
      }
    } else if (is_osd_writable(grant, pool_name)) {
      return true;
    }
  }

  return false;
}
351
352 } // anonymous namespace
353
// Record that PG 'ps' of this pool has been clean through
// 'last_epoch_clean'.  Maintains two invariants: 'floor' is the minimum
// lec over all PGs that have reported, and 'next_missing' is the first
// PG index that has not reported yet (lec == 0).
void LastEpochClean::Lec::report(unsigned pg_num, ps_t ps,
                                 epoch_t last_epoch_clean)
{
  if (ps >= pg_num) {
    // removed PG
    return;
  }
  epoch_by_pg.resize(pg_num, 0);
  const auto old_lec = epoch_by_pg[ps];
  if (old_lec >= last_epoch_clean) {
    // stale lec
    return;
  }
  epoch_by_pg[ps] = last_epoch_clean;
  if (last_epoch_clean < floor) {
    floor = last_epoch_clean;
  } else if (last_epoch_clean > floor) {
    if (old_lec == floor) {
      // probably should increase floor?
      // this PG may have been the one holding the floor down; recompute
      auto new_floor = std::min_element(std::begin(epoch_by_pg),
                                        std::end(epoch_by_pg));
      floor = *new_floor;
    }
  }
  if (ps != next_missing) {
    return;
  }
  // advance next_missing past every PG that has now reported (lec != 0)
  for (; next_missing < epoch_by_pg.size(); next_missing++) {
    if (epoch_by_pg[next_missing] == 0) {
      break;
    }
  }
}
387
// Forget all last-epoch-clean state for a deleted pool.
void LastEpochClean::remove_pool(uint64_t pool)
{
  report_by_pool.erase(pool);
}
392
// Route a single PG's last-epoch-clean report to its pool's Lec record,
// creating the record on first report for that pool.
void LastEpochClean::report(unsigned pg_num, const pg_t& pg,
                            epoch_t last_epoch_clean)
{
  auto& lec = report_by_pool[pg.pool()];
  return lec.report(pg_num, pg.ps(), last_epoch_clean);
}
399
400 epoch_t LastEpochClean::get_lower_bound(const OSDMap& latest) const
401 {
402 auto floor = latest.get_epoch();
403 for (auto& pool : latest.get_pools()) {
404 auto reported = report_by_pool.find(pool.first);
405 if (reported == report_by_pool.end()) {
406 return 0;
407 }
408 if (reported->second.next_missing < pool.second.get_pg_num()) {
409 return 0;
410 }
411 if (reported->second.floor < floor) {
412 floor = reported->second.floor;
413 }
414 }
415 return floor;
416 }
417
418 void LastEpochClean::dump(Formatter *f) const
419 {
420 f->open_array_section("per_pool");
421
422 for (auto& [pool, lec] : report_by_pool) {
423 f->open_object_section("pool");
424 f->dump_unsigned("poolid", pool);
425 f->dump_unsigned("floor", lec.floor);
426 f->close_section();
427 }
428
429 f->close_section();
430 }
431
// Completion context for an async OSDMap->PG mapping job: on success it
// logs the elapsed time, refreshes the creating-PGs state and wakes any
// pg-create subscribers.
class C_UpdateCreatingPGs : public Context {
public:
  OSDMonitor *osdmon;
  utime_t start;   // when the mapping job was started (for timing)
  epoch_t epoch;   // epoch the mapping was computed against
  C_UpdateCreatingPGs(OSDMonitor *osdmon, epoch_t e) :
    osdmon(osdmon), start(ceph_clock_now()), epoch(e) {}
  void finish(int r) override {
    // r < 0 means the job was aborted (e.g. superseded by a newer epoch)
    if (r >= 0) {
      utime_t end = ceph_clock_now();
      dout(10) << "osdmap epoch " << epoch << " mapping took "
               << (end - start) << " seconds" << dendl;
      osdmon->update_creating_pgs();
      osdmon->check_pg_creates_subs();
    }
  }
};
449
#undef dout_prefix
#define dout_prefix _prefix(_dout, mon, osdmap)
// Log-line prefix: "mon.<name>@<rank>(<state>).osd e<epoch> "
static ostream& _prefix(std::ostream *_dout, Monitor &mon, const OSDMap& osdmap) {
  return *_dout << "mon." << mon.name << "@" << mon.rank
                << "(" << mon.get_state_name()
                << ").osd e" << osdmap.get_epoch() << " ";
}
457
// Construct the osd paxos service: wraps the two osdmap LRUs for the
// priority cache manager and registers as a config observer so cache
// sizing reacts to runtime option changes.
OSDMonitor::OSDMonitor(
  CephContext *cct,
  Monitor &mn,
  Paxos &p,
  const string& service_name)
 : PaxosService(mn, p, service_name),
   cct(cct),
   inc_osd_cache(g_conf()->mon_osd_cache_size),
   full_osd_cache(g_conf()->mon_osd_cache_size),
   has_osdmap_manifest(false),
   mapper(mn.cct, &mn.cpu_tp)
{
  inc_cache = std::make_shared<IncCache>(this);
  full_cache = std::make_shared<FullCache>(this);
  cct->_conf.add_observer(this);
  int r = _set_cache_sizes();
  if (r < 0) {
    // fall back to the fixed-size LRUs configured by mon_osd_cache_size
    derr << __func__ << " using default osd cache size - mon_osd_cache_size ("
         << g_conf()->mon_osd_cache_size
         << ") without priority cache management"
         << dendl;
  }
}
481
// Config keys we subscribe to; changes are delivered to
// handle_conf_change().
const char **OSDMonitor::get_tracked_conf_keys() const
{
  static const char* KEYS[] = {
    "mon_memory_target",
    "mon_memory_autotune",
    "rocksdb_cache_size",
    NULL  // terminator
  };
  return KEYS;
}
492
493 void OSDMonitor::handle_conf_change(const ConfigProxy& conf,
494 const std::set<std::string> &changed)
495 {
496 dout(10) << __func__ << " " << changed << dendl;
497
498 if (changed.count("mon_memory_autotune")) {
499 _set_cache_autotuning();
500 }
501 if (changed.count("mon_memory_target") ||
502 changed.count("rocksdb_cache_size")) {
503 int r = _update_mon_cache_settings();
504 if (r < 0) {
505 derr << __func__ << " mon_memory_target:"
506 << g_conf()->mon_memory_target
507 << " rocksdb_cache_size:"
508 << g_conf()->rocksdb_cache_size
509 << ". Unable to update cache size."
510 << dendl;
511 }
512 }
513 }
514
// React to a change of mon_memory_autotune: tear down or stand up the
// priority cache manager (pcm) accordingly.
void OSDMonitor::_set_cache_autotuning()
{
  if (!g_conf()->mon_memory_autotune && pcm != nullptr) {
    // Disable cache autotuning
    std::lock_guard l(balancer_lock);
    pcm = nullptr;
    // NOTE(review): the mon_memory_autotune member flag is not cleared
    // here; it is only set false on a failed re-registration below —
    // confirm whether leaving it true on disable is intended.
  }

  if (g_conf()->mon_memory_autotune && pcm == nullptr) {
    int r = register_cache_with_pcm();
    if (r < 0) {
      dout(10) << __func__
               << " Error while registering osdmon caches with pcm."
               << " Cache auto tuning not enabled."
               << dendl;
      mon_memory_autotune = false;
    } else {
      mon_memory_autotune = true;
    }
  }
}
536
// Apply new mon_memory_target / rocksdb_cache_size values: recompute
// the kv/inc/full cache ratios and, when autotuning is active, push the
// new min/max/target into the priority cache manager.  Returns 0 on
// success, -EINVAL on invalid sizes or ratio failure (in which case the
// previous sizes are restored).
int OSDMonitor::_update_mon_cache_settings()
{
  if (g_conf()->mon_memory_target <= 0 ||
      g_conf()->mon_memory_target < mon_memory_min ||
      g_conf()->rocksdb_cache_size <= 0) {
    return -EINVAL;
  }

  if (pcm == nullptr && rocksdb_binned_kv_cache == nullptr) {
    derr << __func__ << " not using pcm and rocksdb" << dendl;
    return -EINVAL;
  }

  // remember old values so we can roll back if the ratios don't work out
  uint64_t old_mon_memory_target = mon_memory_target;
  uint64_t old_rocksdb_cache_size = rocksdb_cache_size;

  // Set the new pcm memory cache sizes
  mon_memory_target = g_conf()->mon_memory_target;
  rocksdb_cache_size = g_conf()->rocksdb_cache_size;

  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // leave headroom for expected fragmentation before computing the max
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    mon_memory_target = old_mon_memory_target;
    rocksdb_cache_size = old_rocksdb_cache_size;
    return -EINVAL;
  }

  if (mon_memory_autotune && pcm != nullptr) {
    std::lock_guard l(balancer_lock);
    // set pcm cache levels
    pcm->set_target_memory(target);
    pcm->set_min_memory(min);
    pcm->set_max_memory(max);
    // tune memory based on new values
    pcm->tune_memory();
    pcm->balance();
    _set_new_cache_sizes();
    dout(1) << __func__ << " Updated mon cache setting."
             << " target: " << target
             << " min: " << min
             << " max: " << max
             << dendl;
  }
  return 0;
}
596
// Initialize cache sizing state from config when autotuning is enabled.
// Returns -EINVAL if the configured target/min sizes are unusable, in
// which case the caller falls back to fixed-size LRUs.
int OSDMonitor::_set_cache_sizes()
{
  if (g_conf()->mon_memory_autotune) {
    // set the new osdmon cache targets to be managed by pcm
    mon_osd_cache_size = g_conf()->mon_osd_cache_size;
    rocksdb_cache_size = g_conf()->rocksdb_cache_size;
    // reuse the osd memory model constants for base usage/fragmentation
    mon_memory_base = cct->_conf.get_val<Option::size_t>("osd_memory_base");
    mon_memory_fragmentation = cct->_conf.get_val<double>("osd_memory_expected_fragmentation");
    mon_memory_target = g_conf()->mon_memory_target;
    mon_memory_min = g_conf()->mon_osd_cache_size_min;
    if (mon_memory_target <= 0 || mon_memory_min <= 0) {
      derr << __func__ << " mon_memory_target:" << mon_memory_target
           << " mon_memory_min:" << mon_memory_min
           << ". Invalid size option(s) provided."
           << dendl;
      return -EINVAL;
    }
    // Set the initial inc and full LRU cache sizes
    inc_osd_cache.set_bytes(mon_memory_min);
    full_osd_cache.set_bytes(mon_memory_min);
    mon_memory_autotune = g_conf()->mon_memory_autotune;
  }
  return 0;
}
621
// True if the pending incremental carries a new crush map.
bool OSDMonitor::_have_pending_crush()
{
  return pending_inc.crush.length() > 0;
}
626
// The crush map of the committed osdmap (ignores any pending change).
CrushWrapper &OSDMonitor::_get_stable_crush()
{
  return *osdmap.crush;
}
631
632 CrushWrapper OSDMonitor::_get_pending_crush()
633 {
634 bufferlist bl;
635 if (pending_inc.crush.length())
636 bl = pending_inc.crush;
637 else
638 osdmap.crush->encode(bl, CEPH_FEATURES_SUPPORTED_DEFAULT);
639
640 auto p = bl.cbegin();
641 CrushWrapper crush;
642 crush.decode(p);
643 return crush;
644 }
645
// Build the very first osdmap (epoch 1) for a new cluster: start from a
// user-supplied mkfs map if one was stashed, otherwise a simple default
// map; then set initial flags, full ratios and release requirements,
// and encode it all into the pending incremental.
void OSDMonitor::create_initial()
{
  dout(10) << "create_initial for " << mon.monmap->fsid << dendl;

  OSDMap newmap;

  bufferlist bl;
  mon.store->get("mkfs", "osdmap", bl);

  if (bl.length()) {
    // a seed osdmap was provided at mkfs time; adopt it (fixing the fsid)
    newmap.decode(bl);
    newmap.set_fsid(mon.monmap->fsid);
  } else {
    newmap.build_simple(cct, 0, mon.monmap->fsid, 0);
  }
  newmap.set_epoch(1);
  newmap.created = newmap.modified = ceph_clock_now();

  // new clusters should sort bitwise by default.
  newmap.set_flag(CEPH_OSDMAP_SORTBITWISE);

  newmap.flags |=
    CEPH_OSDMAP_RECOVERY_DELETES |
    CEPH_OSDMAP_PURGED_SNAPDIRS |
    CEPH_OSDMAP_PGLOG_HARDLIMIT;
  // ratios may be configured as percentages (>1.0); normalize to [0,1]
  newmap.full_ratio = g_conf()->mon_osd_full_ratio;
  if (newmap.full_ratio > 1.0) newmap.full_ratio /= 100;
  newmap.backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
  if (newmap.backfillfull_ratio > 1.0) newmap.backfillfull_ratio /= 100;
  newmap.nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
  if (newmap.nearfull_ratio > 1.0) newmap.nearfull_ratio /= 100;

  // new cluster should require latest by default
  if (g_conf().get_val<bool>("mon_debug_no_require_quincy")) {
    if (g_conf().get_val<bool>("mon_debug_no_require_pacific")) {
      derr << __func__ << " mon_debug_no_require_quincy and pacific=true" << dendl;
      newmap.require_osd_release = ceph_release_t::nautilus;
    } else {
      derr << __func__ << " mon_debug_no_require_quincy=true" << dendl;
      newmap.require_osd_release = ceph_release_t::pacific;
    }
  } else {
    newmap.require_osd_release = ceph_release_t::quincy;
  }

  ceph_release_t r = ceph_release_from_name(g_conf()->mon_osd_initial_require_min_compat_client);
  if (!r) {
    ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
  }
  newmap.require_min_compat_client = r;

  // encode into pending incremental
  uint64_t features = newmap.get_encoding_features();
  newmap.encode(pending_inc.fullmap,
                features | CEPH_FEATURE_RESERVED);
  pending_inc.full_crc = newmap.get_crc();
  dout(20) << " full crc " << pending_inc.full_crc << dendl;
}
704
705 void OSDMonitor::get_store_prefixes(std::set<string>& s) const
706 {
707 s.insert(service_name);
708 s.insert(OSD_PG_CREATING_PREFIX);
709 s.insert(OSD_METADATA_PREFIX);
710 s.insert(OSD_SNAP_PREFIX);
711 }
712
// Bring the in-memory osdmap up to the latest committed paxos version:
// optionally jump-start from the newest stashed full map, then replay
// each committed incremental, stashing full maps (and cross-checking
// CRCs against the leader's canonical encoding) along the way.  Finally
// refresh derived state: down->out tracking, subscriptions, msgr
// feature bits and stretch-mode transitions.
void OSDMonitor::update_from_paxos(bool *need_bootstrap)
{
  // we really don't care if the version has been updated, because we may
  // have trimmed without having increased the last committed; yet, we may
  // need to update the in-memory manifest.
  load_osdmap_manifest();

  version_t version = get_last_committed();
  if (version == osdmap.epoch)
    return;
  ceph_assert(version > osdmap.epoch);

  dout(15) << "update_from_paxos paxos e " << version
           << ", my e " << osdmap.epoch << dendl;

  // remembered so we can detect newly-up OSDs for stretch recovery below
  int prev_num_up_osd = osdmap.num_up_osd;

  if (mapping_job) {
    // an in-flight mapping job is for an older epoch; cancel it
    if (!mapping_job->is_done()) {
      dout(1) << __func__ << " mapping job "
              << mapping_job.get() << " did not complete, "
              << mapping_job->shards << " left, canceling" << dendl;
      mapping_job->abort();
    }
    mapping_job.reset();
  }

  load_health();

  /*
   * We will possibly have a stashed latest that *we* wrote, and we will
   * always be sure to have the oldest full map in the first..last range
   * due to encode_trim_extra(), which includes the oldest full map in the trim
   * transaction.
   *
   * encode_trim_extra() does not however write the full map's
   * version to 'full_latest'. This is only done when we are building the
   * full maps from the incremental versions. But don't panic! We make sure
   * that the following conditions find whichever full map version is newer.
   */
  version_t latest_full = get_version_latest_full();
  if (latest_full == 0 && get_first_committed() > 1)
    latest_full = get_first_committed();

  if (get_first_committed() > 1 &&
      latest_full < get_first_committed()) {
    // the monitor could be just sync'ed with its peer, and the latest_full key
    // is not encoded in the paxos commits in encode_pending(), so we need to
    // make sure we get it pointing to a proper version.
    version_t lc = get_last_committed();
    version_t fc = get_first_committed();

    dout(10) << __func__ << " looking for valid full map in interval"
             << " [" << fc << ", " << lc << "]" << dendl;

    // scan newest-to-oldest for the most recent stashed full map
    latest_full = 0;
    for (version_t v = lc; v >= fc; v--) {
      string full_key = "full_" + stringify(v);
      if (mon.store->exists(get_service_name(), full_key)) {
        dout(10) << __func__ << " found latest full map v " << v << dendl;
        latest_full = v;
        break;
      }
    }

    ceph_assert(latest_full > 0);
    auto t(std::make_shared<MonitorDBStore::Transaction>());
    put_version_latest_full(t, latest_full);
    mon.store->apply_transaction(t);
    dout(10) << __func__ << " updated the on-disk full map version to "
             << latest_full << dendl;
  }

  if ((latest_full > 0) && (latest_full > osdmap.epoch)) {
    // jump-start from the stashed full map rather than replaying every
    // incremental from our current, older epoch
    bufferlist latest_bl;
    get_version_full(latest_full, latest_bl);
    ceph_assert(latest_bl.length() != 0);
    dout(7) << __func__ << " loading latest full map e" << latest_full << dendl;
    osdmap = OSDMap();
    osdmap.decode(latest_bl);
  }

  bufferlist bl;
  if (!mon.store->get(OSD_PG_CREATING_PREFIX, "creating", bl)) {
    auto p = bl.cbegin();
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    creating_pgs.decode(p);
    dout(7) << __func__ << " loading creating_pgs last_scan_epoch "
            << creating_pgs.last_scan_epoch
            << " with " << creating_pgs.pgs.size() << " pgs" << dendl;
  } else {
    dout(1) << __func__ << " missing creating pgs; upgrade from post-kraken?"
            << dendl;
  }

  // walk through incrementals
  MonitorDBStore::TransactionRef t;
  size_t tx_size = 0;
  while (version > osdmap.epoch) {
    bufferlist inc_bl;
    int err = get_version(osdmap.epoch+1, inc_bl);
    ceph_assert(err == 0);
    ceph_assert(inc_bl.length());
    // set priority cache manager levels if the osdmap is
    // being populated for the first time.
    if (mon_memory_autotune && pcm == nullptr) {
      int r = register_cache_with_pcm();
      if (r < 0) {
        dout(10) << __func__
                 << " Error while registering osdmon caches with pcm."
                 << " Proceeding without cache auto tuning."
                 << dendl;
      }
    }

    dout(7) << "update_from_paxos applying incremental " << osdmap.epoch+1
            << dendl;
    OSDMap::Incremental inc(inc_bl);
    err = osdmap.apply_incremental(inc);
    ceph_assert(err == 0);

    if (!t)
      t.reset(new MonitorDBStore::Transaction);

    // Write out the full map for all past epochs. Encode the full
    // map with the same features as the incremental. If we don't
    // know, use the quorum features. If we don't know those either,
    // encode with all features.
    uint64_t f = inc.encode_features;
    if (!f)
      f = mon.get_quorum_con_features();
    if (!f)
      f = -1;
    bufferlist full_bl;
    osdmap.encode(full_bl, f | CEPH_FEATURE_RESERVED);
    tx_size += full_bl.length();

    bufferlist orig_full_bl;
    get_version_full(osdmap.epoch, orig_full_bl);
    if (orig_full_bl.length()) {
      // the primary provided the full map
      ceph_assert(inc.have_crc);
      if (inc.full_crc != osdmap.crc) {
        // This will happen if the mons were running mixed versions in
        // the past or some other circumstance made the full encoded
        // maps divergent. Reloading here will bring us back into
        // sync with the primary for this and all future maps. OSDs
        // will also be brought back into sync when they discover the
        // crc mismatch and request a full map from a mon.
        derr << __func__ << " full map CRC mismatch, resetting to canonical"
             << dendl;

        // NB: each dout(20)..dendl below is a single macro-scoped block,
        // so the two local JSONFormatter declarations do not collide.
        dout(20) << __func__ << " my (bad) full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        full_bl.hexdump(*_dout);
        *_dout << dendl;

        osdmap = OSDMap();
        osdmap.decode(orig_full_bl);

        dout(20) << __func__ << " canonical full osdmap:\n";
        JSONFormatter jf(true);
        jf.dump_object("osdmap", osdmap);
        jf.flush(*_dout);
        *_dout << "\nhexdump:\n";
        orig_full_bl.hexdump(*_dout);
        *_dout << dendl;
      }
    } else {
      ceph_assert(!inc.have_crc);
      put_version_full(t, osdmap.epoch, full_bl);
    }
    put_version_latest_full(t, osdmap.epoch);

    // share
    dout(1) << osdmap << dendl;

    if (osdmap.epoch == 1) {
      // the mkfs seed map has been consumed; drop it from the store
      t->erase("mkfs", "osdmap");
    }

    // flush periodically so a long replay doesn't build one huge txn
    if (tx_size > g_conf()->mon_sync_max_payload_size*2) {
      mon.store->apply_transaction(t);
      t = MonitorDBStore::TransactionRef();
      tx_size = 0;
    }
    for (const auto [osd, state] : inc.new_state) {
      if (state & CEPH_OSD_UP) {
        // could be marked up *or* down, but we're too lazy to check which
        last_osd_report.erase(osd);
      }
    }
    for (const auto [osd, weight] : inc.new_weight) {
      if (weight == CEPH_OSD_OUT) {
        // manually marked out, so drop it
        osd_epochs.erase(osd);
      }
    }
  }

  if (t) {
    mon.store->apply_transaction(t);
  }

  // reconcile down_pending_out (candidates for auto-out) with the new map
  bool marked_osd_down = false;
  for (int o = 0; o < osdmap.get_max_osd(); o++) {
    if (osdmap.is_out(o))
      continue;
    auto found = down_pending_out.find(o);
    if (osdmap.is_down(o)) {
      // populate down -> out map
      if (found == down_pending_out.end()) {
        dout(10) << " adding osd." << o << " to down_pending_out map" << dendl;
        down_pending_out[o] = ceph_clock_now();
        marked_osd_down = true;
      }
    } else {
      if (found != down_pending_out.end()) {
        dout(10) << " removing osd." << o << " from down_pending_out map" << dendl;
        down_pending_out.erase(found);
      }
    }
  }
  // XXX: need to trim MonSession connected with a osd whose id > max_osd?

  check_osdmap_subs();
  check_pg_creates_subs();

  share_map_with_random_osd();
  update_logger();
  process_failures();

  // make sure our feature bits reflect the latest map
  update_msgr_features();

  if (!mon.is_leader()) {
    // will be called by on_active() on the leader, avoid doing so twice
    start_mapping();
  }
  if (osdmap.stretch_mode_enabled) {
    dout(20) << "Stretch mode enabled in this map" << dendl;
    mon.try_engage_stretch_mode();
    if (osdmap.degraded_stretch_mode) {
      dout(20) << "Degraded stretch mode set in this map" << dendl;
      if (!osdmap.recovering_stretch_mode) {
        mon.set_degraded_stretch_mode();
        // begin recovery once enough OSDs have come back up
        if (prev_num_up_osd < osdmap.num_up_osd &&
            (osdmap.num_up_osd / (double)osdmap.num_osd) >
            cct->_conf.get_val<double>("mon_stretch_cluster_recovery_ratio")) {
          // TODO: This works for 2-site clusters when the OSD maps are appropriately
          // trimmed and everything is "normal" but not if you have a lot of out OSDs
          // you're ignoring or in some really degenerate failure cases
          dout(10) << "Enabling recovery stretch mode in this map" << dendl;
          mon.go_recovery_stretch_mode();
        }
      } else {
        mon.set_recovery_stretch_mode();
      }
    } else {
      mon.set_healthy_stretch_mode();
    }
    if (marked_osd_down &&
        (!osdmap.degraded_stretch_mode || osdmap.recovering_stretch_mode)) {
      dout(20) << "Checking degraded stretch mode due to osd changes" << dendl;
      mon.maybe_go_degraded_stretch_mode();
    }
  }
}
984
// Create the priority cache manager and register the rocksdb kv cache
// plus the inc/full osdmap caches with it.  Returns 0 on success,
// -EINVAL if sizes are invalid, rocksdb has no priority cache, or the
// ratios cannot be computed.
int OSDMonitor::register_cache_with_pcm()
{
  if (mon_memory_target <= 0 || mon_memory_min <= 0) {
    derr << __func__ << " Invalid memory size specified for mon caches."
         << " Caches will not be auto-tuned."
         << dendl;
    return -EINVAL;
  }
  uint64_t base = mon_memory_base;
  double fragmentation = mon_memory_fragmentation;
  // For calculating total target memory, consider rocksdb cache size.
  uint64_t target = mon_memory_target;
  uint64_t min = mon_memory_min;
  uint64_t max = min;

  // Apply the same logic as in bluestore to set the max amount
  // of memory to use for cache. Assume base memory for OSDMaps
  // and then add in some overhead for fragmentation.
  uint64_t ltarget = (1.0 - fragmentation) * target;
  if (ltarget > base + min) {
    max = ltarget - base;
  }

  rocksdb_binned_kv_cache = mon.store->get_priority_cache();
  if (!rocksdb_binned_kv_cache) {
    derr << __func__ << " not using rocksdb" << dendl;
    return -EINVAL;
  }

  int r = _set_cache_ratios();
  if (r < 0) {
    derr << __func__ << " Cache ratios for pcm could not be set."
         << " Review the kv (rocksdb) and mon_memory_target sizes."
         << dendl;
    return -EINVAL;
  }

  pcm = std::make_shared<PriorityCache::Manager>(
      cct, min, max, target, true);
  pcm->insert("kv", rocksdb_binned_kv_cache, true);
  pcm->insert("inc", inc_cache, true);
  pcm->insert("full", full_cache, true);
  dout(1) << __func__ << " pcm target: " << target
           << " pcm max: " << max
           << " pcm min: " << min
           << " inc_osd_cache size: " << inc_osd_cache.get_size()
           << dendl;
  return 0;
}
1034
// Divide the memory budget between the kv (rocksdb) cache and the two
// osdmap caches: kv gets rocksdb_cache_size / mon_memory_target, and
// inc/full split the remainder evenly.  Fails (-EINVAL, kv ratio
// restored) if the kv cache alone would consume the whole target.
int OSDMonitor::_set_cache_ratios()
{
  double old_cache_kv_ratio = cache_kv_ratio;

  // Set the cache ratios for kv(rocksdb), inc and full caches
  cache_kv_ratio = (double)rocksdb_cache_size / (double)mon_memory_target;
  if (cache_kv_ratio >= 1.0) {
    derr << __func__ << " Cache kv ratio (" << cache_kv_ratio
         << ") must be in range [0,<1.0]."
         << dendl;
    cache_kv_ratio = old_cache_kv_ratio;
    return -EINVAL;
  }
  rocksdb_binned_kv_cache->set_cache_ratio(cache_kv_ratio);
  cache_inc_ratio = cache_full_ratio = (1.0 - cache_kv_ratio) / 2;
  inc_cache->set_cache_ratio(cache_inc_ratio);
  full_cache->set_cache_ratio(cache_full_ratio);

  dout(1) << __func__ << " kv ratio " << cache_kv_ratio
           << " inc ratio " << cache_inc_ratio
           << " full ratio " << cache_full_ratio
           << dendl;
  return 0;
}
1059
1060 void OSDMonitor::start_mapping()
1061 {
1062 // initiate mapping job
1063 if (mapping_job) {
1064 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1065 << dendl;
1066 mapping_job->abort();
1067 }
1068 if (!osdmap.get_pools().empty()) {
1069 auto fin = new C_UpdateCreatingPGs(this, osdmap.get_epoch());
1070 mapping_job = mapping.start_update(osdmap, mapper,
1071 g_conf()->mon_osd_mapping_pgs_per_chunk);
1072 dout(10) << __func__ << " started mapping job " << mapping_job.get()
1073 << " at " << fin->start << dendl;
1074 mapping_job->set_finish_event(fin);
1075 } else {
1076 dout(10) << __func__ << " no pools, no mapping job" << dendl;
1077 mapping_job = nullptr;
1078 }
1079 }
1080
1081 void OSDMonitor::update_msgr_features()
1082 {
1083 const int types[] = {
1084 entity_name_t::TYPE_OSD,
1085 entity_name_t::TYPE_CLIENT,
1086 entity_name_t::TYPE_MDS,
1087 entity_name_t::TYPE_MON
1088 };
1089 for (int type : types) {
1090 uint64_t mask;
1091 uint64_t features = osdmap.get_features(type, &mask);
1092 if ((mon.messenger->get_policy(type).features_required & mask) != features) {
1093 dout(0) << "crush map has features " << features << ", adjusting msgr requires" << dendl;
1094 ceph::net::Policy p = mon.messenger->get_policy(type);
1095 p.features_required = (p.features_required & ~mask) | features;
1096 mon.messenger->set_policy(type, p);
1097 }
1098 }
1099 }
1100
1101 void OSDMonitor::on_active()
1102 {
1103 update_logger();
1104
1105 if (mon.is_leader()) {
1106 mon.clog->debug() << "osdmap " << osdmap;
1107 if (!priority_convert) {
1108 // Only do this once at start-up
1109 convert_pool_priorities();
1110 priority_convert = true;
1111 }
1112 } else {
1113 list<MonOpRequestRef> ls;
1114 take_all_failures(ls);
1115 while (!ls.empty()) {
1116 MonOpRequestRef op = ls.front();
1117 op->mark_osdmon_event(__func__);
1118 dispatch(op);
1119 ls.pop_front();
1120 }
1121 }
1122 start_mapping();
1123 }
1124
void OSDMonitor::on_restart()
{
  // Forget when each OSD last reported in; stale timestamps should not
  // carry across a restart of this service.
  last_osd_report.clear();
}
1129
1130 void OSDMonitor::on_shutdown()
1131 {
1132 dout(10) << __func__ << dendl;
1133 if (mapping_job) {
1134 dout(10) << __func__ << " canceling previous mapping_job " << mapping_job.get()
1135 << dendl;
1136 mapping_job->abort();
1137 }
1138
1139 // discard failure info, waiters
1140 list<MonOpRequestRef> ls;
1141 take_all_failures(ls);
1142 ls.clear();
1143 }
1144
1145 void OSDMonitor::update_logger()
1146 {
1147 dout(10) << "update_logger" << dendl;
1148
1149 mon.cluster_logger->set(l_cluster_num_osd, osdmap.get_num_osds());
1150 mon.cluster_logger->set(l_cluster_num_osd_up, osdmap.get_num_up_osds());
1151 mon.cluster_logger->set(l_cluster_num_osd_in, osdmap.get_num_in_osds());
1152 mon.cluster_logger->set(l_cluster_osd_epoch, osdmap.get_epoch());
1153 }
1154
1155 void OSDMonitor::create_pending()
1156 {
1157 pending_inc = OSDMap::Incremental(osdmap.epoch+1);
1158 pending_inc.fsid = mon.monmap->fsid;
1159 pending_metadata.clear();
1160 pending_metadata_rm.clear();
1161 pending_pseudo_purged_snaps.clear();
1162
1163 dout(10) << "create_pending e " << pending_inc.epoch << dendl;
1164
1165 // safety checks (this shouldn't really happen)
1166 {
1167 if (osdmap.backfillfull_ratio <= 0) {
1168 pending_inc.new_backfillfull_ratio = g_conf()->mon_osd_backfillfull_ratio;
1169 if (pending_inc.new_backfillfull_ratio > 1.0)
1170 pending_inc.new_backfillfull_ratio /= 100;
1171 dout(1) << __func__ << " setting backfillfull_ratio = "
1172 << pending_inc.new_backfillfull_ratio << dendl;
1173 }
1174 if (osdmap.full_ratio <= 0) {
1175 pending_inc.new_full_ratio = g_conf()->mon_osd_full_ratio;
1176 if (pending_inc.new_full_ratio > 1.0)
1177 pending_inc.new_full_ratio /= 100;
1178 dout(1) << __func__ << " setting full_ratio = "
1179 << pending_inc.new_full_ratio << dendl;
1180 }
1181 if (osdmap.nearfull_ratio <= 0) {
1182 pending_inc.new_nearfull_ratio = g_conf()->mon_osd_nearfull_ratio;
1183 if (pending_inc.new_nearfull_ratio > 1.0)
1184 pending_inc.new_nearfull_ratio /= 100;
1185 dout(1) << __func__ << " setting nearfull_ratio = "
1186 << pending_inc.new_nearfull_ratio << dendl;
1187 }
1188 }
1189 }
1190
// Compute the next version of the creating-PGs bookkeeping, given the
// pending incremental (inc) and the map it will produce (nextmap).
// Works on a snapshot of creating_pgs taken under creating_pgs_lock and
// returns the updated copy; the caller is responsible for persisting it.
// Steps: (1) scan for pools added/removed since the last scan epoch,
// (2) drop pgs already reported created, (3) drop pgs that do not exist
// in nextmap, (4) move queued pgs into the creating set up to
// mon_osd_max_creating_pgs, and (5) on octopus+ quorums, advance each
// creating pg's history/past_intervals to reflect mapping changes.
creating_pgs_t
OSDMonitor::update_pending_pgs(const OSDMap::Incremental& inc,
                               const OSDMap& nextmap)
{
  dout(10) << __func__ << dendl;
  creating_pgs_t pending_creatings;
  {
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    pending_creatings = creating_pgs;
  }
  // check for new or old pools
  if (pending_creatings.last_scan_epoch < inc.epoch) {
    unsigned queued = 0;
    // scan both the committed pools and the pools being added by inc
    queued += scan_for_creating_pgs(osdmap.get_pools(),
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    queued += scan_for_creating_pgs(inc.new_pools,
                                    inc.old_pools,
                                    inc.modified,
                                    &pending_creatings);
    dout(10) << __func__ << " " << queued << " pools queued" << dendl;
    for (auto deleted_pool : inc.old_pools) {
      auto removed = pending_creatings.remove_pool(deleted_pool);
      dout(10) << __func__ << " " << removed
               << " pg removed because containing pool deleted: "
               << deleted_pool << dendl;
      last_epoch_clean.remove_pool(deleted_pool);
    }
    // pgmon updates its creating_pgs in check_osd_map() which is called by
    // on_active() and check_osd_map() could be delayed if lease expires, so its
    // creating_pgs could be stale in comparison with the one of osdmon. let's
    // trim them here. otherwise, they will be added back after being erased.
    unsigned removed = 0;
    for (auto& pg : pending_created_pgs) {
      dout(20) << __func__ << " noting created pg " << pg << dendl;
      pending_creatings.created_pools.insert(pg.pool());
      removed += pending_creatings.pgs.erase(pg);
    }
    pending_created_pgs.clear();
    dout(10) << __func__ << " " << removed
             << " pgs removed because they're created" << dendl;
    pending_creatings.last_scan_epoch = osdmap.get_epoch();
  }

  // filter out any pgs that shouldn't exist.
  {
    auto i = pending_creatings.pgs.begin();
    while (i != pending_creatings.pgs.end()) {
      if (!nextmap.pg_exists(i->first)) {
        dout(10) << __func__ << " removing pg " << i->first
                 << " which should not exist" << dendl;
        i = pending_creatings.pgs.erase(i);
      } else {
        ++i;
      }
    }
  }

  // process queue: promote queued pg ranges into the creating set,
  // capped at mon_osd_max_creating_pgs in-flight creations at once.
  unsigned max = std::max<int64_t>(1, g_conf()->mon_osd_max_creating_pgs);
  const auto total = pending_creatings.pgs.size();
  while (pending_creatings.pgs.size() < max &&
         !pending_creatings.queue.empty()) {
    auto p = pending_creatings.queue.begin();
    int64_t poolid = p->first;
    dout(10) << __func__ << " pool " << poolid
             << " created " << p->second.created
             << " modified " << p->second.modified
             << " [" << p->second.start << "-" << p->second.end << ")"
             << dendl;
    int64_t n = std::min<int64_t>(max - pending_creatings.pgs.size(),
                                  p->second.end - p->second.start);
    ps_t first = p->second.start;
    ps_t end = first + n;
    for (ps_t ps = first; ps < end; ++ps) {
      const pg_t pgid{ps, static_cast<uint64_t>(poolid)};
      // NOTE: use the *current* epoch as the PG creation epoch so that the
      // OSD does not have to generate a long set of PastIntervals.
      pending_creatings.pgs.emplace(
        pgid,
        creating_pgs_t::pg_create_info(inc.epoch,
                                       p->second.modified));
      dout(10) << __func__ << " adding " << pgid << dendl;
    }
    p->second.start = end;
    if (p->second.done()) {
      dout(10) << __func__ << " done with queue for " << poolid << dendl;
      pending_creatings.queue.erase(p);
    } else {
      dout(10) << __func__ << " pool " << poolid
               << " now [" << p->second.start << "-" << p->second.end << ")"
               << dendl;
    }
  }
  dout(10) << __func__ << " queue remaining: " << pending_creatings.queue.size()
           << " pools" << dendl;

  if (mon.monmap->min_mon_release >= ceph_release_t::octopus) {
    // walk creating pgs' history and past_intervals forward
    for (auto& i : pending_creatings.pgs) {
      // this mirrors PG::start_peering_interval()
      pg_t pgid = i.first;

      // this is a bit imprecise, but sufficient?
      // Predicate deciding whether a past interval could have gone
      // active: enough shards to meet the pool's min_size.
      struct min_size_predicate_t : public IsPGRecoverablePredicate {
        const pg_pool_t *pi;
        bool operator()(const set<pg_shard_t> &have) const {
          return have.size() >= pi->min_size;
        }
        explicit min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
      } min_size_predicate(nextmap.get_pg_pool(pgid.pool()));

      vector<int> up, acting;
      int up_primary, acting_primary;
      nextmap.pg_to_up_acting_osds(
        pgid, &up, &up_primary, &acting, &acting_primary);
      if (i.second.history.epoch_created == 0) {
        // new pg entry, set it up
        i.second.up = up;
        i.second.acting = acting;
        i.second.up_primary = up_primary;
        i.second.acting_primary = acting_primary;
        i.second.history = pg_history_t(i.second.create_epoch,
                                        i.second.create_stamp);
        dout(10) << __func__ << " pg " << pgid << " just added, "
                 << " up " << i.second.up
                 << " p " << i.second.up_primary
                 << " acting " << i.second.acting
                 << " p " << i.second.acting_primary
                 << " history " << i.second.history
                 << " past_intervals " << i.second.past_intervals
                 << dendl;
      } else {
        std::stringstream debug;
        // If the mapping changed enough to start a new interval, update
        // the pg's history epochs and remember the old interval in
        // past_intervals (done inside check_new_interval).
        if (PastIntervals::check_new_interval(
              i.second.acting_primary, acting_primary,
              i.second.acting, acting,
              i.second.up_primary, up_primary,
              i.second.up, up,
              i.second.history.same_interval_since,
              i.second.history.last_epoch_clean,
              &nextmap,
              &osdmap,
              pgid,
              min_size_predicate,
              &i.second.past_intervals,
              &debug)) {
          epoch_t e = inc.epoch;
          i.second.history.same_interval_since = e;
          if (i.second.up != up) {
            i.second.history.same_up_since = e;
          }
          if (i.second.acting_primary != acting_primary) {
            i.second.history.same_primary_since = e;
          }
          if (pgid.is_split(
                osdmap.get_pg_num(pgid.pool()),
                nextmap.get_pg_num(pgid.pool()),
                nullptr)) {
            i.second.history.last_epoch_split = e;
          }
          dout(10) << __func__ << " pg " << pgid << " new interval,"
                   << " up " << i.second.up << " -> " << up
                   << " p " << i.second.up_primary << " -> " << up_primary
                   << " acting " << i.second.acting << " -> " << acting
                   << " p " << i.second.acting_primary << " -> "
                   << acting_primary
                   << " history " << i.second.history
                   << " past_intervals " << i.second.past_intervals
                   << dendl;
          dout(20) << " debug: " << debug.str() << dendl;
          i.second.up = up;
          i.second.acting = acting;
          i.second.up_primary = up_primary;
          i.second.acting_primary = acting_primary;
        }
      }
    }
  }
  dout(10) << __func__
           << " " << (pending_creatings.pgs.size() - total)
           << "/" << pending_creatings.pgs.size()
           << " pgs added from queued pools" << dendl;
  return pending_creatings;
}
1377
// Decide which PGs need pg_temp entries primed into pending_inc before
// it is encoded.  If the pending change is broad (new crush map, newly
// up OSDs, or a weight increase) all PGs are examined via a parallel
// PrimeTempJob; otherwise only the PGs mapped to the "interesting" OSDs
// (going down, or weight-reduced) are walked inline, bounded by
// mon_osd_prime_pg_temp_max_time.
void OSDMonitor::maybe_prime_pg_temp()
{
  bool all = false;
  if (pending_inc.crush.length()) {
    dout(10) << __func__ << " new crush map, all" << dendl;
    all = true;
  }

  if (!pending_inc.new_up_client.empty()) {
    dout(10) << __func__ << " new up osds, all" << dendl;
    all = true;
  }

  // check for interesting OSDs
  set<int> osds;
  // OSDs whose UP bit is being flipped while currently up, i.e. going down
  for (auto p = pending_inc.new_state.begin();
       !all && p != pending_inc.new_state.end();
       ++p) {
    if ((p->second & CEPH_OSD_UP) &&
        osdmap.is_up(p->first)) {
      osds.insert(p->first);
    }
  }
  for (auto p = pending_inc.new_weight.begin();
       !all && p != pending_inc.new_weight.end();
       ++p) {
    if (osdmap.exists(p->first) && p->second < osdmap.get_weight(p->first)) {
      // weight reduction
      osds.insert(p->first);
    } else {
      // weight increases can pull PGs from anywhere; fall back to "all"
      dout(10) << __func__ << " osd." << p->first << " weight increase, all"
               << dendl;
      all = true;
    }
  }

  if (!all && osds.empty())
    return;

  if (!all) {
    // Estimate the per-OSD workload from the first OSD's pg count; if it
    // would touch too large a fraction of all PGs, do them all instead.
    unsigned estimate =
      mapping.get_osd_acting_pgs(*osds.begin()).size() * osds.size();
    if (estimate > mapping.get_num_pgs() *
        g_conf()->mon_osd_prime_pg_temp_max_estimate) {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds >= "
               << g_conf()->mon_osd_prime_pg_temp_max_estimate << " of total "
               << mapping.get_num_pgs() << " pgs, all"
               << dendl;
      all = true;
    } else {
      dout(10) << __func__ << " estimate " << estimate << " pgs on "
               << osds.size() << " osds" << dendl;
    }
  }

  // Build the map as it will look after pending_inc is applied.
  OSDMap next;
  next.deepish_copy_from(osdmap);
  next.apply_incremental(pending_inc);

  if (next.get_pools().empty()) {
    dout(10) << __func__ << " no pools, no pg_temp priming" << dendl;
  } else if (all) {
    PrimeTempJob job(next, this);
    mapper.queue(&job, g_conf()->mon_osd_mapping_pgs_per_chunk, {});
    if (job.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time)) {
      dout(10) << __func__ << " done in " << job.get_duration() << dendl;
    } else {
      dout(10) << __func__ << " did not finish in "
               << g_conf()->mon_osd_prime_pg_temp_max_time
               << ", stopping" << dendl;
      job.abort();
    }
  } else {
    dout(10) << __func__ << " " << osds.size() << " interesting osds" << dendl;
    utime_t stop = ceph_clock_now();
    stop += g_conf()->mon_osd_prime_pg_temp_max_time;
    // only check the clock every `chunk` pgs to keep the loop cheap
    const int chunk = 1000;
    int n = chunk;
    std::unordered_set<pg_t> did_pgs;
    for (auto osd : osds) {
      auto& pgs = mapping.get_osd_acting_pgs(osd);
      dout(20) << __func__ << " osd." << osd << " " << pgs << dendl;
      for (auto pgid : pgs) {
        if (!did_pgs.insert(pgid).second) {
          continue;  // already handled via another interesting osd
        }
        prime_pg_temp(next, pgid);
        if (--n <= 0) {
          n = chunk;
          if (ceph_clock_now() > stop) {
            dout(10) << __func__ << " consumed more than "
                     << g_conf()->mon_osd_prime_pg_temp_max_time
                     << " seconds, stopping"
                     << dendl;
            return;
          }
        }
      }
    }
  }
}
1480
// Consider priming pg_temp for a single pg: if its acting set will
// change in `next` and the current acting set is still usable
// (non-empty, and at least pool min_size shards), record the current
// acting set as a pg_temp entry in pending_inc.  When next_up ==
// next_acting an *empty* pg_temp entry is recorded instead, which
// clears any existing pg_temp mapping for the pg.
void OSDMonitor::prime_pg_temp(
  const OSDMap& next,
  pg_t pgid)
{
  // TODO: remove this creating_pgs direct access?
  if (creating_pgs.pgs.count(pgid)) {
    return;  // still being created; nothing to preserve
  }
  if (!osdmap.pg_exists(pgid)) {
    return;
  }

  // current mapping (from the background mapping job)
  vector<int> up, acting;
  mapping.get(pgid, &up, nullptr, &acting, nullptr);

  // mapping under the post-incremental map
  vector<int> next_up, next_acting;
  int next_up_primary, next_acting_primary;
  next.pg_to_up_acting_osds(pgid, &next_up, &next_up_primary,
                            &next_acting, &next_acting_primary);
  if (acting == next_acting &&
      !(up != acting && next_up == next_acting))
    return;  // no change since last epoch

  if (acting.empty())
    return;  // if previously empty now we can be no worse off
  const pg_pool_t *pool = next.get_pg_pool(pgid.pool());
  if (pool && acting.size() < pool->min_size)
    return;  // can be no worse off than before

  if (next_up == next_acting) {
    acting.clear();
    dout(20) << __func__ << " next_up == next_acting now, clear pg_temp"
             << dendl;
  }

  dout(20) << __func__ << " " << pgid << " " << up << "/" << acting
           << " -> " << next_up << "/" << next_acting
           << ", priming " << acting
           << dendl;
  {
    // lock: may be invoked concurrently from PrimeTempJob workers
    std::lock_guard l(prime_pg_temp_lock);
    // do not touch a mapping if a change is pending
    pending_inc.new_pg_temp.emplace(
      pgid,
      mempool::osdmap::vector<int>(acting.begin(), acting.end()));
  }
}
1528
1529 /**
1530 * @note receiving a transaction in this function gives a fair amount of
1531 * freedom to the service implementation if it does need it. It shouldn't.
1532 */
1533 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
1534 {
1535 dout(10) << "encode_pending e " << pending_inc.epoch
1536 << dendl;
1537
1538 if (do_prune(t)) {
1539 dout(1) << __func__ << " osdmap full prune encoded e"
1540 << pending_inc.epoch << dendl;
1541 }
1542
1543 // finalize up pending_inc
1544 pending_inc.modified = ceph_clock_now();
1545
1546 int r = pending_inc.propagate_base_properties_to_tiers(cct, osdmap);
1547 ceph_assert(r == 0);
1548
1549 if (mapping_job) {
1550 if (!mapping_job->is_done()) {
1551 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1552 << mapping_job.get() << " did not complete, "
1553 << mapping_job->shards << " left" << dendl;
1554 mapping_job->abort();
1555 } else if (mapping.get_epoch() < osdmap.get_epoch()) {
1556 dout(1) << __func__ << " skipping prime_pg_temp; mapping job "
1557 << mapping_job.get() << " is prior epoch "
1558 << mapping.get_epoch() << dendl;
1559 } else {
1560 if (g_conf()->mon_osd_prime_pg_temp) {
1561 maybe_prime_pg_temp();
1562 }
1563 }
1564 } else if (g_conf()->mon_osd_prime_pg_temp) {
1565 dout(1) << __func__ << " skipping prime_pg_temp; mapping job did not start"
1566 << dendl;
1567 }
1568 mapping_job.reset();
1569
1570 // ensure we don't have blank new_state updates. these are interrpeted as
1571 // CEPH_OSD_UP (and almost certainly not what we want!).
1572 auto p = pending_inc.new_state.begin();
1573 while (p != pending_inc.new_state.end()) {
1574 if (p->second == 0) {
1575 dout(10) << "new_state for osd." << p->first << " is 0, removing" << dendl;
1576 p = pending_inc.new_state.erase(p);
1577 } else {
1578 if (p->second & CEPH_OSD_UP) {
1579 pending_inc.new_last_up_change = pending_inc.modified;
1580 }
1581 ++p;
1582 }
1583 }
1584 if (!pending_inc.new_up_client.empty()) {
1585 pending_inc.new_last_up_change = pending_inc.modified;
1586 }
1587 for (auto& i : pending_inc.new_weight) {
1588 if (i.first >= osdmap.max_osd) {
1589 if (i.second) {
1590 // new osd is already marked in
1591 pending_inc.new_last_in_change = pending_inc.modified;
1592 break;
1593 }
1594 } else if (!!i.second != !!osdmap.osd_weight[i.first]) {
1595 // existing osd marked in or out
1596 pending_inc.new_last_in_change = pending_inc.modified;
1597 break;
1598 }
1599 }
1600
1601 {
1602 OSDMap tmp;
1603 tmp.deepish_copy_from(osdmap);
1604 tmp.apply_incremental(pending_inc);
1605
1606 // clean pg_temp mappings
1607 OSDMap::clean_temps(cct, osdmap, tmp, &pending_inc);
1608
1609 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1610 {
1611 // check every upmapped pg for now
1612 // until we could reliably identify certain cases to ignore,
1613 // which is obviously the hard part TBD..
1614 vector<pg_t> pgs_to_check;
1615 tmp.get_upmap_pgs(&pgs_to_check);
1616 if (pgs_to_check.size() <
1617 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk * 2)) {
1618 // not enough pgs, do it inline
1619 tmp.clean_pg_upmaps(cct, &pending_inc);
1620 } else {
1621 CleanUpmapJob job(cct, tmp, pending_inc);
1622 mapper.queue(&job, g_conf()->mon_clean_pg_upmaps_per_chunk, pgs_to_check);
1623 job.wait();
1624 }
1625 }
1626
1627 // update creating pgs first so that we can remove the created pgid and
1628 // process the pool flag removal below in the same osdmap epoch.
1629 auto pending_creatings = update_pending_pgs(pending_inc, tmp);
1630 bufferlist creatings_bl;
1631 uint64_t features = CEPH_FEATURES_ALL;
1632 if (mon.monmap->min_mon_release < ceph_release_t::octopus) {
1633 dout(20) << __func__ << " encoding pending pgs without octopus features"
1634 << dendl;
1635 features &= ~CEPH_FEATURE_SERVER_OCTOPUS;
1636 }
1637 encode(pending_creatings, creatings_bl, features);
1638 t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
1639
1640 // remove any old (or incompat) POOL_CREATING flags
1641 for (auto& i : tmp.get_pools()) {
1642 if (tmp.require_osd_release < ceph_release_t::nautilus) {
1643 // pre-nautilus OSDMaps shouldn't get this flag.
1644 if (pending_inc.new_pools.count(i.first)) {
1645 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1646 }
1647 }
1648 if (i.second.has_flag(pg_pool_t::FLAG_CREATING) &&
1649 !pending_creatings.still_creating_pool(i.first)) {
1650 dout(10) << __func__ << " done creating pool " << i.first
1651 << ", clearing CREATING flag" << dendl;
1652 if (pending_inc.new_pools.count(i.first) == 0) {
1653 pending_inc.new_pools[i.first] = i.second;
1654 }
1655 pending_inc.new_pools[i.first].flags &= ~pg_pool_t::FLAG_CREATING;
1656 }
1657 }
1658
1659 // collect which pools are currently affected by
1660 // the near/backfill/full osd(s),
1661 // and set per-pool near/backfill/full flag instead
1662 set<int64_t> full_pool_ids;
1663 set<int64_t> backfillfull_pool_ids;
1664 set<int64_t> nearfull_pool_ids;
1665 tmp.get_full_pools(cct,
1666 &full_pool_ids,
1667 &backfillfull_pool_ids,
1668 &nearfull_pool_ids);
1669 if (full_pool_ids.empty() ||
1670 backfillfull_pool_ids.empty() ||
1671 nearfull_pool_ids.empty()) {
1672 // normal case - no nearfull, backfillfull or full osds
1673 // try cancel any improper nearfull/backfillfull/full pool
1674 // flags first
1675 for (auto &pool: tmp.get_pools()) {
1676 auto p = pool.first;
1677 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL) &&
1678 nearfull_pool_ids.empty()) {
1679 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1680 << "'s nearfull flag" << dendl;
1681 if (pending_inc.new_pools.count(p) == 0) {
1682 // load original pool info first!
1683 pending_inc.new_pools[p] = pool.second;
1684 }
1685 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1686 }
1687 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL) &&
1688 backfillfull_pool_ids.empty()) {
1689 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1690 << "'s backfillfull flag" << dendl;
1691 if (pending_inc.new_pools.count(p) == 0) {
1692 pending_inc.new_pools[p] = pool.second;
1693 }
1694 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1695 }
1696 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) &&
1697 full_pool_ids.empty()) {
1698 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1699 // set by EQUOTA, skipping
1700 continue;
1701 }
1702 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1703 << "'s full flag" << dendl;
1704 if (pending_inc.new_pools.count(p) == 0) {
1705 pending_inc.new_pools[p] = pool.second;
1706 }
1707 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1708 }
1709 }
1710 }
1711 if (!full_pool_ids.empty()) {
1712 dout(10) << __func__ << " marking pool(s) " << full_pool_ids
1713 << " as full" << dendl;
1714 for (auto &p: full_pool_ids) {
1715 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL)) {
1716 continue;
1717 }
1718 if (pending_inc.new_pools.count(p) == 0) {
1719 pending_inc.new_pools[p] = tmp.pools[p];
1720 }
1721 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_FULL;
1722 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1723 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1724 }
1725 // cancel FLAG_FULL for pools which are no longer full too
1726 for (auto &pool: tmp.get_pools()) {
1727 auto p = pool.first;
1728 if (full_pool_ids.count(p)) {
1729 // skip pools we have just marked as full above
1730 continue;
1731 }
1732 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL) ||
1733 tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1734 // don't touch if currently is not full
1735 // or is running out of quota (and hence considered as full)
1736 continue;
1737 }
1738 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1739 << "'s full flag" << dendl;
1740 if (pending_inc.new_pools.count(p) == 0) {
1741 pending_inc.new_pools[p] = pool.second;
1742 }
1743 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_FULL;
1744 }
1745 }
1746 if (!backfillfull_pool_ids.empty()) {
1747 for (auto &p: backfillfull_pool_ids) {
1748 if (full_pool_ids.count(p)) {
1749 // skip pools we have already considered as full above
1750 continue;
1751 }
1752 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1753 // make sure FLAG_FULL is truly set, so we are safe not
1754 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1755 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1756 continue;
1757 }
1758 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1759 // don't bother if pool is already marked as backfillfull
1760 continue;
1761 }
1762 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1763 << "'s as backfillfull" << dendl;
1764 if (pending_inc.new_pools.count(p) == 0) {
1765 pending_inc.new_pools[p] = tmp.pools[p];
1766 }
1767 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_BACKFILLFULL;
1768 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1769 }
1770 // cancel FLAG_BACKFILLFULL for pools
1771 // which are no longer backfillfull too
1772 for (auto &pool: tmp.get_pools()) {
1773 auto p = pool.first;
1774 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1775 // skip pools we have just marked as backfillfull/full above
1776 continue;
1777 }
1778 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_BACKFILLFULL)) {
1779 // and don't touch if currently is not backfillfull
1780 continue;
1781 }
1782 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1783 << "'s backfillfull flag" << dendl;
1784 if (pending_inc.new_pools.count(p) == 0) {
1785 pending_inc.new_pools[p] = pool.second;
1786 }
1787 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_BACKFILLFULL;
1788 }
1789 }
1790 if (!nearfull_pool_ids.empty()) {
1791 for (auto &p: nearfull_pool_ids) {
1792 if (full_pool_ids.count(p) || backfillfull_pool_ids.count(p)) {
1793 continue;
1794 }
1795 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
1796 // make sure FLAG_FULL is truly set, so we are safe not
1797 // to set a extra (redundant) FLAG_NEARFULL flag
1798 ceph_assert(tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_FULL));
1799 continue;
1800 }
1801 if (tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1802 // don't bother if pool is already marked as nearfull
1803 continue;
1804 }
1805 dout(10) << __func__ << " marking pool '" << tmp.pool_name[p]
1806 << "'s as nearfull" << dendl;
1807 if (pending_inc.new_pools.count(p) == 0) {
1808 pending_inc.new_pools[p] = tmp.pools[p];
1809 }
1810 pending_inc.new_pools[p].flags |= pg_pool_t::FLAG_NEARFULL;
1811 }
1812 // cancel FLAG_NEARFULL for pools
1813 // which are no longer nearfull too
1814 for (auto &pool: tmp.get_pools()) {
1815 auto p = pool.first;
1816 if (full_pool_ids.count(p) ||
1817 backfillfull_pool_ids.count(p) ||
1818 nearfull_pool_ids.count(p)) {
1819 // skip pools we have just marked as
1820 // nearfull/backfillfull/full above
1821 continue;
1822 }
1823 if (!tmp.get_pg_pool(p)->has_flag(pg_pool_t::FLAG_NEARFULL)) {
1824 // and don't touch if currently is not nearfull
1825 continue;
1826 }
1827 dout(10) << __func__ << " clearing pool '" << tmp.pool_name[p]
1828 << "'s nearfull flag" << dendl;
1829 if (pending_inc.new_pools.count(p) == 0) {
1830 pending_inc.new_pools[p] = pool.second;
1831 }
1832 pending_inc.new_pools[p].flags &= ~pg_pool_t::FLAG_NEARFULL;
1833 }
1834 }
1835
1836 // min_compat_client?
1837 if (!tmp.require_min_compat_client) {
1838 auto mv = tmp.get_min_compat_client();
1839 dout(1) << __func__ << " setting require_min_compat_client to currently "
1840 << "required " << mv << dendl;
1841 mon.clog->info() << "setting require_min_compat_client to currently "
1842 << "required " << mv;
1843 pending_inc.new_require_min_compat_client = mv;
1844 }
1845
1846 if (osdmap.require_osd_release < ceph_release_t::nautilus &&
1847 tmp.require_osd_release >= ceph_release_t::nautilus) {
1848 dout(10) << __func__ << " first nautilus+ epoch" << dendl;
1849 // add creating flags?
1850 for (auto& i : tmp.get_pools()) {
1851 if (pending_creatings.still_creating_pool(i.first)) {
1852 dout(10) << __func__ << " adding CREATING flag to pool " << i.first
1853 << dendl;
1854 if (pending_inc.new_pools.count(i.first) == 0) {
1855 pending_inc.new_pools[i.first] = i.second;
1856 }
1857 pending_inc.new_pools[i.first].flags |= pg_pool_t::FLAG_CREATING;
1858 }
1859 }
1860 // adjust blocklist items to all be TYPE_ANY
1861 for (auto& i : tmp.blocklist) {
1862 auto a = i.first;
1863 a.set_type(entity_addr_t::TYPE_ANY);
1864 pending_inc.new_blocklist[a] = i.second;
1865 pending_inc.old_blocklist.push_back(i.first);
1866 }
1867 }
1868
1869 if (osdmap.require_osd_release < ceph_release_t::octopus &&
1870 tmp.require_osd_release >= ceph_release_t::octopus) {
1871 dout(10) << __func__ << " first octopus+ epoch" << dendl;
1872
1873 // adjust obsoleted cache modes
1874 for (auto& [poolid, pi] : tmp.pools) {
1875 if (pi.cache_mode == pg_pool_t::CACHEMODE_FORWARD) {
1876 if (pending_inc.new_pools.count(poolid) == 0) {
1877 pending_inc.new_pools[poolid] = pi;
1878 }
1879 dout(10) << __func__ << " switching pool " << poolid
1880 << " cachemode from forward -> proxy" << dendl;
1881 pending_inc.new_pools[poolid].cache_mode = pg_pool_t::CACHEMODE_PROXY;
1882 }
1883 if (pi.cache_mode == pg_pool_t::CACHEMODE_READFORWARD) {
1884 if (pending_inc.new_pools.count(poolid) == 0) {
1885 pending_inc.new_pools[poolid] = pi;
1886 }
1887 dout(10) << __func__ << " switching pool " << poolid
1888 << " cachemode from readforward -> readproxy" << dendl;
1889 pending_inc.new_pools[poolid].cache_mode =
1890 pg_pool_t::CACHEMODE_READPROXY;
1891 }
1892 }
1893
1894 // clear removed_snaps for every pool
1895 for (auto& [poolid, pi] : tmp.pools) {
1896 if (pi.removed_snaps.empty()) {
1897 continue;
1898 }
1899 if (pending_inc.new_pools.count(poolid) == 0) {
1900 pending_inc.new_pools[poolid] = pi;
1901 }
1902 dout(10) << __func__ << " clearing pool " << poolid << " removed_snaps"
1903 << dendl;
1904 pending_inc.new_pools[poolid].removed_snaps.clear();
1905 }
1906
1907 // create a combined purged snap epoch key for all purged snaps
1908 // prior to this epoch, and store it in the current epoch (i.e.,
1909 // the last pre-octopus epoch, just prior to the one we're
1910 // encoding now).
1911 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
1912 it->lower_bound("purged_snap_");
1913 map<int64_t,snap_interval_set_t> combined;
1914 while (it->valid()) {
1915 if (it->key().find("purged_snap_") != 0) {
1916 break;
1917 }
1918 string k = it->key();
1919 long long unsigned pool;
1920 int n = sscanf(k.c_str(), "purged_snap_%llu_", &pool);
1921 if (n != 1) {
1922 derr << __func__ << " invalid purged_snaps key '" << k << "'" << dendl;
1923 } else {
1924 bufferlist v = it->value();
1925 auto p = v.cbegin();
1926 snapid_t begin, end;
1927 ceph::decode(begin, p);
1928 ceph::decode(end, p);
1929 combined[pool].insert(begin, end - begin);
1930 }
1931 it->next();
1932 }
1933 if (!combined.empty()) {
1934 string k = make_purged_snap_epoch_key(pending_inc.epoch - 1);
1935 bufferlist v;
1936 ceph::encode(combined, v);
1937 t->put(OSD_SNAP_PREFIX, k, v);
1938 dout(10) << __func__ << " recording pre-octopus purged_snaps in epoch "
1939 << (pending_inc.epoch - 1) << ", " << v.length() << " bytes"
1940 << dendl;
1941 } else {
1942 dout(10) << __func__ << " there were no pre-octopus purged snaps"
1943 << dendl;
1944 }
1945
1946 // clean out the old removed_snap_ and removed_epoch keys
1947 // ('`' is ASCII '_' + 1)
1948 t->erase_range(OSD_SNAP_PREFIX, "removed_snap_", "removed_snap`");
1949 t->erase_range(OSD_SNAP_PREFIX, "removed_epoch_", "removed_epoch`");
1950 }
1951 }
1952
1953 // tell me about it
1954 for (auto i = pending_inc.new_state.begin();
1955 i != pending_inc.new_state.end();
1956 ++i) {
1957 int s = i->second ? i->second : CEPH_OSD_UP;
1958 if (s & CEPH_OSD_UP) {
1959 dout(2) << " osd." << i->first << " DOWN" << dendl;
1960 // Reset laggy parameters if failure interval exceeds a threshold.
1961 const osd_xinfo_t& xi = osdmap.get_xinfo(i->first);
1962 if ((xi.laggy_probability || xi.laggy_interval) && xi.down_stamp.sec()) {
1963 int last_failure_interval = pending_inc.modified.sec() - xi.down_stamp.sec();
1964 if (grace_interval_threshold_exceeded(last_failure_interval)) {
1965 set_default_laggy_params(i->first);
1966 }
1967 }
1968 }
1969 if (s & CEPH_OSD_EXISTS)
1970 dout(2) << " osd." << i->first << " DNE" << dendl;
1971 }
1972 for (auto i = pending_inc.new_up_client.begin();
1973 i != pending_inc.new_up_client.end();
1974 ++i) {
1975 //FIXME: insert cluster addresses too
1976 dout(2) << " osd." << i->first << " UP " << i->second << dendl;
1977 }
1978 for (map<int32_t,uint32_t>::iterator i = pending_inc.new_weight.begin();
1979 i != pending_inc.new_weight.end();
1980 ++i) {
1981 if (i->second == CEPH_OSD_OUT) {
1982 dout(2) << " osd." << i->first << " OUT" << dendl;
1983 } else if (i->second == CEPH_OSD_IN) {
1984 dout(2) << " osd." << i->first << " IN" << dendl;
1985 } else {
1986 dout(2) << " osd." << i->first << " WEIGHT " << hex << i->second << dec << dendl;
1987 }
1988 }
1989
1990 // features for osdmap and its incremental
1991 uint64_t features;
1992
1993 // encode full map and determine its crc
1994 OSDMap tmp;
1995 {
1996 tmp.deepish_copy_from(osdmap);
1997 tmp.apply_incremental(pending_inc);
1998
1999 // determine appropriate features
2000 features = tmp.get_encoding_features();
2001 dout(10) << __func__ << " encoding full map with "
2002 << tmp.require_osd_release
2003 << " features " << features << dendl;
2004
2005 // the features should be a subset of the mon quorum's features!
2006 ceph_assert((features & ~mon.get_quorum_con_features()) == 0);
2007
2008 bufferlist fullbl;
2009 encode(tmp, fullbl, features | CEPH_FEATURE_RESERVED);
2010 pending_inc.full_crc = tmp.get_crc();
2011
2012 // include full map in the txn. note that old monitors will
2013 // overwrite this. new ones will now skip the local full map
2014 // encode and reload from this.
2015 put_version_full(t, pending_inc.epoch, fullbl);
2016 }
2017
2018 // encode
2019 ceph_assert(get_last_committed() + 1 == pending_inc.epoch);
2020 bufferlist bl;
2021 encode(pending_inc, bl, features | CEPH_FEATURE_RESERVED);
2022
2023 dout(20) << " full_crc " << tmp.get_crc()
2024 << " inc_crc " << pending_inc.inc_crc << dendl;
2025
2026 /* put everything in the transaction */
2027 put_version(t, pending_inc.epoch, bl);
2028 put_last_committed(t, pending_inc.epoch);
2029
2030 // metadata, too!
2031 for (map<int,bufferlist>::iterator p = pending_metadata.begin();
2032 p != pending_metadata.end();
2033 ++p) {
2034 Metadata m;
2035 auto mp = p->second.cbegin();
2036 decode(m, mp);
2037 auto it = m.find("osd_objectstore");
2038 if (it != m.end()) {
2039 if (it->second == "filestore") {
2040 filestore_osds.insert(p->first);
2041 } else {
2042 filestore_osds.erase(p->first);
2043 }
2044 }
2045 t->put(OSD_METADATA_PREFIX, stringify(p->first), p->second);
2046 }
2047 for (set<int>::iterator p = pending_metadata_rm.begin();
2048 p != pending_metadata_rm.end();
2049 ++p) {
2050 filestore_osds.erase(*p);
2051 t->erase(OSD_METADATA_PREFIX, stringify(*p));
2052 }
2053 pending_metadata.clear();
2054 pending_metadata_rm.clear();
2055
2056 // purged_snaps
2057 if (tmp.require_osd_release >= ceph_release_t::octopus &&
2058 !pending_inc.new_purged_snaps.empty()) {
2059 // all snaps purged this epoch (across all pools)
2060 string k = make_purged_snap_epoch_key(pending_inc.epoch);
2061 bufferlist v;
2062 encode(pending_inc.new_purged_snaps, v);
2063 t->put(OSD_SNAP_PREFIX, k, v);
2064 }
2065 for (auto& i : pending_inc.new_purged_snaps) {
2066 for (auto q = i.second.begin();
2067 q != i.second.end();
2068 ++q) {
2069 insert_purged_snap_update(i.first, q.get_start(), q.get_end(),
2070 pending_inc.epoch,
2071 t);
2072 }
2073 }
2074 for (auto& [pool, snaps] : pending_pseudo_purged_snaps) {
2075 for (auto snap : snaps) {
2076 insert_purged_snap_update(pool, snap, snap + 1,
2077 pending_inc.epoch,
2078 t);
2079 }
2080 }
2081
2082 // health
2083 health_check_map_t next;
2084 tmp.check_health(cct, &next);
2085 // OSD_FILESTORE
2086 check_for_filestore_osds(&next);
2087 encode_health(next, t);
2088 }
2089
2090 int OSDMonitor::load_metadata(int osd, map<string, string>& m, ostream *err)
2091 {
2092 bufferlist bl;
2093 int r = mon.store->get(OSD_METADATA_PREFIX, stringify(osd), bl);
2094 if (r < 0)
2095 return r;
2096 try {
2097 auto p = bl.cbegin();
2098 decode(m, p);
2099 }
2100 catch (ceph::buffer::error& e) {
2101 if (err)
2102 *err << "osd." << osd << " metadata is corrupt";
2103 return -EIO;
2104 }
2105 return 0;
2106 }
2107
2108 void OSDMonitor::count_metadata(const string& field, map<string,int> *out)
2109 {
2110 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2111 if (osdmap.is_up(osd)) {
2112 map<string,string> meta;
2113 load_metadata(osd, meta, nullptr);
2114 auto p = meta.find(field);
2115 if (p == meta.end()) {
2116 (*out)["unknown"]++;
2117 } else {
2118 (*out)[p->second]++;
2119 }
2120 }
2121 }
2122 }
2123
2124 void OSDMonitor::count_metadata(const string& field, Formatter *f)
2125 {
2126 map<string,int> by_val;
2127 count_metadata(field, &by_val);
2128 f->open_object_section(field.c_str());
2129 for (auto& p : by_val) {
2130 f->dump_int(p.first.c_str(), p.second);
2131 }
2132 f->close_section();
2133 }
2134
2135 void OSDMonitor::get_versions(std::map<string, list<string>> &versions)
2136 {
2137 for (int osd = 0; osd < osdmap.get_max_osd(); ++osd) {
2138 if (osdmap.is_up(osd)) {
2139 map<string,string> meta;
2140 load_metadata(osd, meta, nullptr);
2141 auto p = meta.find("ceph_version_short");
2142 if (p == meta.end()) continue;
2143 versions[p->second].push_back(string("osd.") + stringify(osd));
2144 }
2145 }
2146 }
2147
2148 int OSDMonitor::get_osd_objectstore_type(int osd, string *type)
2149 {
2150 map<string, string> metadata;
2151 int r = load_metadata(osd, metadata, nullptr);
2152 if (r < 0)
2153 return r;
2154
2155 auto it = metadata.find("osd_objectstore");
2156 if (it == metadata.end())
2157 return -ENOENT;
2158 *type = it->second;
2159 return 0;
2160 }
2161
2162 void OSDMonitor::get_filestore_osd_list()
2163 {
2164 for (unsigned osd = 0; osd < osdmap.get_num_osds(); ++osd) {
2165 string objectstore_type;
2166 int r = get_osd_objectstore_type(osd, &objectstore_type);
2167 if (r == 0 && objectstore_type == "filestore") {
2168 filestore_osds.insert(osd);
2169 }
2170 }
2171 }
2172
2173 void OSDMonitor::check_for_filestore_osds(health_check_map_t *checks)
2174 {
2175 if (g_conf()->mon_warn_on_filestore_osds &&
2176 filestore_osds.size() > 0) {
2177 ostringstream ss, deprecated_tip;
2178 list<string> detail;
2179 ss << filestore_osds.size()
2180 << " osd(s) "
2181 << (filestore_osds.size() == 1 ? "is" : "are")
2182 << " running Filestore";
2183 deprecated_tip << ss.str();
2184 ss << " [Deprecated]";
2185 auto& d = checks->add("OSD_FILESTORE", HEALTH_WARN, ss.str(),
2186 filestore_osds.size());
2187 deprecated_tip << ", which has been deprecated and"
2188 << " not been optimized for QoS"
2189 << " (Filestore OSDs will use 'osd_op_queue = wpq' strictly)";
2190 detail.push_back(deprecated_tip.str());
2191 d.detail.swap(detail);
2192 }
2193 }
2194
2195 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id,
2196 const pg_pool_t &pool,
2197 ostream *err)
2198 {
2199 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2200 // since filestore osds could always join the pool later
2201 set<int> checked_osds;
2202 for (unsigned ps = 0; ps < std::min(8u, pool.get_pg_num()); ++ps) {
2203 vector<int> up, acting;
2204 pg_t pgid(ps, pool_id);
2205 osdmap.pg_to_up_acting_osds(pgid, up, acting);
2206 for (int osd : up) {
2207 if (checked_osds.find(osd) != checked_osds.end())
2208 continue;
2209 string objectstore_type;
2210 int r = get_osd_objectstore_type(osd, &objectstore_type);
2211 // allow with missing metadata, e.g. due to an osd never booting yet
2212 if (r < 0 || objectstore_type == "bluestore") {
2213 checked_osds.insert(osd);
2214 continue;
2215 }
2216 *err << "osd." << osd << " uses " << objectstore_type;
2217 return false;
2218 }
2219 }
2220 return true;
2221 }
2222
2223 int OSDMonitor::dump_osd_metadata(int osd, Formatter *f, ostream *err)
2224 {
2225 map<string,string> m;
2226 if (int r = load_metadata(osd, m, err))
2227 return r;
2228 for (map<string,string>::iterator p = m.begin(); p != m.end(); ++p)
2229 f->dump_string(p->first.c_str(), p->second);
2230 return 0;
2231 }
2232
2233 void OSDMonitor::print_nodes(Formatter *f)
2234 {
2235 // group OSDs by their hosts
2236 map<string, list<int> > osds; // hostname => osd
2237 for (int osd = 0; osd < osdmap.get_max_osd(); osd++) {
2238 map<string, string> m;
2239 if (load_metadata(osd, m, NULL)) {
2240 continue;
2241 }
2242 map<string, string>::iterator hostname = m.find("hostname");
2243 if (hostname == m.end()) {
2244 // not likely though
2245 continue;
2246 }
2247 osds[hostname->second].push_back(osd);
2248 }
2249
2250 dump_services(f, osds, "osd");
2251 }
2252
// After a commit, push the newest osdmap incremental to one randomly
// chosen up OSD so the new epoch can propagate through the cluster
// via peer-to-peer sharing.
void OSDMonitor::share_map_with_random_osd()
{
  if (osdmap.get_num_up_osds() == 0) {
    dout(10) << __func__ << " no up osds, don't share with anyone" << dendl;
    return;
  }

  MonSession *s = mon.session_map.get_random_osd_session(&osdmap);
  if (!s) {
    dout(10) << __func__ << " no up osd on our session map" << dendl;
    return;
  }

  dout(10) << "committed, telling random " << s->name
	   << " all about it" << dendl;

  // get feature of the peer
  // use quorum_con_features, if it's an anonymous connection.
  uint64_t features = s->con_features ? s->con_features :
                                        mon.get_quorum_con_features();
  // whatev, they'll request more if they need it
  MOSDMap *m = build_incremental(osdmap.get_epoch() - 1, osdmap.get_epoch(), features);
  s->con->send_message(m);
  // NOTE: do *not* record osd has up to this epoch (as we do
  // elsewhere) as they may still need to request older values.
}
2279
// Compute the highest osdmap version that may safely be trimmed from
// the store right now, or 0 when trimming must not happen (no quorum,
// PGs still creating, or the debug block option is set).
version_t OSDMonitor::get_trim_to() const
{
  // without a quorum we cannot reason about cluster-wide state
  if (mon.get_quorum().empty()) {
    dout(10) << __func__ << " quorum not formed, trim_to = 0" << dendl;
    return 0;
  }

  {
    // PGs still being created may need arbitrarily old maps
    std::lock_guard<std::mutex> l(creating_pgs_lock);
    if (!creating_pgs.pgs.empty()) {
      dout(10) << __func__ << " pgs creating, trim_to = 0" << dendl;
      return 0;
    }
  }

  // debug hook: an operator can freeze osdmap trimming entirely
  if (g_conf().get_val<bool>("mon_debug_block_osdmap_trim")) {
    dout(0) << __func__
	    << " blocking osdmap trim"
	    << " ('mon_debug_block_osdmap_trim' set to 'true')"
	    << " trim_to = 0" << dendl;
    return 0;
  }

  {
    // never trim past the oldest epoch an OSD or PG may still need
    epoch_t floor = get_min_last_epoch_clean();
    dout(10) << " min_last_epoch_clean " << floor << dendl;
    if (g_conf()->mon_osd_force_trim_to > 0 &&
	g_conf()->mon_osd_force_trim_to < (int)get_last_committed()) {
      // operator override (debug/repair): force a specific trim point
      floor = g_conf()->mon_osd_force_trim_to;
      dout(10) << __func__
	       << " explicit mon_osd_force_trim_to = " << floor << dendl;
    }
    // always retain at least mon_min_osdmap_epochs maps
    unsigned min = g_conf()->mon_min_osdmap_epochs;
    if (floor + min > get_last_committed()) {
      if (min < get_last_committed())
	floor = get_last_committed() - min;
      else
	floor = 0;
    }
    // only worth trimming if the floor is above what is already trimmed
    if (floor > get_first_committed()) {
      dout(10) << __func__ << " trim_to = " << floor << dendl;
      return floor;
    }
  }
  dout(10) << __func__ << " trim_to = 0" << dendl;
  return 0;
}
2327
2328 epoch_t OSDMonitor::get_min_last_epoch_clean() const
2329 {
2330 auto floor = last_epoch_clean.get_lower_bound(osdmap);
2331 // also scan osd epochs
2332 // don't trim past the oldest reported osd epoch
2333 for (auto [osd, epoch] : osd_epochs) {
2334 if (epoch < floor) {
2335 floor = epoch;
2336 }
2337 }
2338 return floor;
2339 }
2340
2341 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx,
2342 version_t first)
2343 {
2344 dout(10) << __func__ << " including full map for e " << first << dendl;
2345 bufferlist bl;
2346 get_version_full(first, bl);
2347 put_version_full(tx, first, bl);
2348
2349 if (has_osdmap_manifest &&
2350 first > osdmap_manifest.get_first_pinned()) {
2351 _prune_update_trimmed(tx, first);
2352 }
2353 }
2354
2355
2356 /* full osdmap prune
2357 *
2358 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2359 */
2360
// (Re)load the osdmap pruning manifest from the store into memory, or
// drop the cached in-memory copy if the store no longer has one.
void OSDMonitor::load_osdmap_manifest()
{
  bool store_has_manifest =
    mon.store->exists(get_service_name(), "osdmap_manifest");

  if (!store_has_manifest) {
    if (!has_osdmap_manifest) {
      return;
    }

    // the store copy vanished (e.g. pruning completed and the manifest
    // was erased); invalidate our cached state as well.
    dout(20) << __func__
	     << " dropping osdmap manifest from memory." << dendl;
    osdmap_manifest = osdmap_manifest_t();
    has_osdmap_manifest = false;
    return;
  }

  dout(20) << __func__
	   << " osdmap manifest detected in store; reload." << dendl;

  bufferlist manifest_bl;
  int r = get_value("osdmap_manifest", manifest_bl);
  if (r < 0) {
    // the key exists but cannot be read: the store is corrupt and
    // there is no sane way to continue.
    derr << __func__ << " unable to read osdmap version manifest" << dendl;
    ceph_abort_msg("error reading manifest");
  }
  osdmap_manifest.decode(manifest_bl);
  has_osdmap_manifest = true;

  dout(10) << __func__ << " store osdmap manifest pinned ("
	   << osdmap_manifest.get_first_pinned()
	   << " .. "
	   << osdmap_manifest.get_last_pinned()
	   << ")"
	   << dendl;
}
2397
// Decide whether a full-map prune pass should run now, based on how
// many epochs we currently hold, the prune_min/prune_interval options,
// and how far a previous prune (if any) already got.
bool OSDMonitor::should_prune() const
{
  version_t first = get_first_committed();
  version_t last = get_last_committed();
  version_t min_osdmap_epochs =
    g_conf().get_val<int64_t>("mon_min_osdmap_epochs");
  version_t prune_min =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
  version_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  version_t last_pinned = osdmap_manifest.get_last_pinned();
  // highest epoch we could pin: keep min_osdmap_epochs untouched at the tail
  version_t last_to_pin = last - min_osdmap_epochs;

  // Make it or break it constraints.
  //
  // If any of these conditions fails, we will not prune, regardless of
  // whether we have an on-disk manifest with an on-going pruning state.
  //
  if ((last - first) <= min_osdmap_epochs) {
    // between the first and last committed epochs, we don't have
    // enough epochs to trim, much less to prune.
    dout(10) << __func__
	     << " currently holding only " << (last - first)
	     << " epochs (min osdmap epochs: " << min_osdmap_epochs
	     << "); do not prune."
	     << dendl;
    return false;

  } else if ((last_to_pin - first) < prune_min) {
    // between the first committed epoch and the last epoch we would prune,
    // we simply don't have enough versions over the minimum to prune maps.
    dout(10) << __func__
	     << " could only prune " << (last_to_pin - first)
	     << " epochs (" << first << ".." << last_to_pin << "), which"
	        " is less than the required minimum (" << prune_min << ")"
	     << dendl;
    return false;

  } else if (has_osdmap_manifest && last_pinned >= last_to_pin) {
    // an earlier prune already pinned everything up to the limit
    dout(10) << __func__
	     << " we have pruned as far as we can; do not prune."
	     << dendl;
    return false;

  } else if (last_pinned + prune_interval > last_to_pin) {
    // not even one full prune interval fits before the limit
    dout(10) << __func__
	     << " not enough epochs to form an interval (last pinned: "
	     << last_pinned << ", last to pin: "
	     << last_to_pin << ", interval: " << prune_interval << ")"
	     << dendl;
    return false;
  }

  dout(15) << __func__
	   << " should prune (" << last_pinned << ".." << last_to_pin << ")"
	   << " lc (" << first << ".." << last << ")"
	   << dendl;
  return true;
}
2457
// Called (via encode_trim_extra) when the trim point has moved past
// the first pinned map: drop all pinned versions below `first` from
// the manifest (pinning `first` itself if needed), and erase the
// manifest entirely once nothing meaningful remains pinned so a
// future prune pass can start from scratch.
void OSDMonitor::_prune_update_trimmed(
    MonitorDBStore::TransactionRef tx,
    version_t first)
{
  dout(10) << __func__
	   << " first " << first
	   << " last_pinned " << osdmap_manifest.get_last_pinned()
	   << dendl;

  // work on a copy; the in-memory manifest is refreshed on commit
  osdmap_manifest_t manifest = osdmap_manifest;

  if (!manifest.is_pinned(first)) {
    manifest.pin(first);
  }

  // erase every pinned version strictly below `first`
  set<version_t>::iterator p_end = manifest.pinned.find(first);
  set<version_t>::iterator p = manifest.pinned.begin();
  manifest.pinned.erase(p, p_end);
  ceph_assert(manifest.get_first_pinned() == first);

  if (manifest.get_last_pinned() == first+1 ||
      manifest.pinned.size() == 1) {
    // we reached the end of the line, as pinned maps go; clean up our
    // manifest, and let `should_prune()` decide whether we should prune
    // again.
    tx->erase(get_service_name(), "osdmap_manifest");
    return;
  }

  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);
}
2491
// Seed a prune pass: choose the first version to pin -- either the
// first committed version (never pruned before, or prior state gone)
// or the last previously pinned version (resuming an earlier prune).
void OSDMonitor::prune_init(osdmap_manifest_t& manifest)
{
  dout(1) << __func__ << dendl;

  version_t pin_first;

  // verify constraints on stable in-memory state
  if (!has_osdmap_manifest) {
    // we must have never pruned, OR if we pruned the state must no longer
    // be relevant (i.e., the state must have been removed alongside with
    // the trim that *must* have removed past the last pinned map in a
    // previous prune).
    ceph_assert(osdmap_manifest.pinned.empty());
    ceph_assert(!mon.store->exists(get_service_name(), "osdmap_manifest"));
    pin_first = get_first_committed();

  } else {
    // we must have pruned in the past AND its state is still relevant
    // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
    // and thus we still hold a manifest in the store).
    ceph_assert(!osdmap_manifest.pinned.empty());
    ceph_assert(osdmap_manifest.get_first_pinned() == get_first_committed());
    ceph_assert(osdmap_manifest.get_last_pinned() < get_last_committed());

    dout(10) << __func__
	     << " first_pinned " << osdmap_manifest.get_first_pinned()
	     << " last_pinned " << osdmap_manifest.get_last_pinned()
	     << dendl;

    pin_first = osdmap_manifest.get_last_pinned();
  }

  manifest.pin(pin_first);
}
2526
2527 bool OSDMonitor::_prune_sanitize_options() const
2528 {
2529 uint64_t prune_interval =
2530 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
2531 uint64_t prune_min =
2532 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_min");
2533 uint64_t txsize =
2534 g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");
2535
2536 bool r = true;
2537
2538 if (prune_interval == 0) {
2539 derr << __func__
2540 << " prune is enabled BUT prune interval is zero; abort."
2541 << dendl;
2542 r = false;
2543 } else if (prune_interval == 1) {
2544 derr << __func__
2545 << " prune interval is equal to one, which essentially means"
2546 " no pruning; abort."
2547 << dendl;
2548 r = false;
2549 }
2550 if (prune_min == 0) {
2551 derr << __func__
2552 << " prune is enabled BUT prune min is zero; abort."
2553 << dendl;
2554 r = false;
2555 }
2556 if (prune_interval > prune_min) {
2557 derr << __func__
2558 << " impossible to ascertain proper prune interval because"
2559 << " it is greater than the minimum prune epochs"
2560 << " (min: " << prune_min << ", interval: " << prune_interval << ")"
2561 << dendl;
2562 r = false;
2563 }
2564
2565 if (txsize < prune_interval - 1) {
2566 derr << __func__
2567 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2568 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval - 1
2569 << "); abort." << dendl;
2570 r = false;
2571 }
2572 return r;
2573 }
2574
// True when the 'mon_osdmap_full_prune_enabled' option is set.
bool OSDMonitor::is_prune_enabled() const {
  return g_conf().get_val<bool>("mon_osdmap_full_prune_enabled");
}
2578
// True when the monitors' required feature set includes OSDMAP_PRUNE,
// i.e. all monitors understand the pruned-map manifest.
bool OSDMonitor::is_prune_supported() const {
  return mon.get_required_mon_features().contains_any(
    ceph::features::mon::FEATURE_OSDMAP_PRUNE);
}
2583
/** do_prune
 *
 * Remove ("prune") full osdmap versions between periodically pinned
 * epochs so the store does not grow unbounded, while keeping enough
 * pinned full maps to rebuild any epoch from a pinned full map plus
 * incrementals. The pinned set is persisted under 'osdmap_manifest'.
 *
 * @returns true if has side-effects; false otherwise.
 */
bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx)
{
  bool enabled = is_prune_enabled();

  dout(1) << __func__ << " osdmap full prune "
	  << ( enabled ? "enabled" : "disabled")
	  << dendl;

  // bail unless pruning is on, options are sane, and there is work to do
  if (!enabled || !_prune_sanitize_options() || !should_prune()) {
    return false;
  }

  // we are beyond the minimum prune versions, we need to remove maps because
  // otherwise the store will grow unbounded and we may end up having issues
  // with available disk space or store hangs.

  // we will not pin all versions. We will leave a buffer number of versions.
  // this allows us the monitor to trim maps without caring too much about
  // pinned maps, and then allow us to use another ceph-mon without these
  // capabilities, without having to repair the store.

  osdmap_manifest_t manifest = osdmap_manifest;

  version_t first = get_first_committed();
  version_t last = get_last_committed();

  version_t last_to_pin = last - g_conf()->mon_min_osdmap_epochs;
  version_t last_pinned = manifest.get_last_pinned();
  uint64_t prune_interval =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_interval");
  uint64_t txsize =
    g_conf().get_val<uint64_t>("mon_osdmap_full_prune_txsize");

  prune_init(manifest);

  // we need to get rid of some osdmaps

  dout(5) << __func__
	  << " lc (" << first << " .. " << last << ")"
	  << " last_pinned " << last_pinned
	  << " interval " << prune_interval
	  << " last_to_pin " << last_to_pin
	  << dendl;

  // We will be erasing maps as we go.
  //
  // We will erase all maps between `last_pinned` and the `next_to_pin`.
  //
  // If `next_to_pin` happens to be greater than `last_to_pin`, then
  // we stop pruning. We could prune the maps between `next_to_pin` and
  // `last_to_pin`, but by not doing it we end up with neater pruned
  // intervals, aligned with `prune_interval`. Besides, this should not be a
  // problem as long as `prune_interval` is set to a sane value, instead of
  // hundreds or thousands of maps.

  auto map_exists = [this](version_t v) {
    string k = mon.store->combine_strings("full", v);
    return mon.store->exists(get_service_name(), k);
  };

  // 'interval' represents the number of maps from the last pinned
  // i.e., if we pinned version 1 and have an interval of 10, we're pinning
  // version 11 next; all intermediate versions will be removed.
  //
  // 'txsize' represents the maximum number of versions we'll be removing in
  // this iteration. If 'txsize' is large enough to perform multiple passes
  // pinning and removing maps, we will do so; if not, we'll do at least one
  // pass. We are quite relaxed about honouring 'txsize', but we'll always
  // ensure that we never go *over* the maximum.

  // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
  uint64_t removal_interval = prune_interval - 1;

  if (txsize < removal_interval) {
    dout(5) << __func__
	    << " setting txsize to removal interval size ("
	    << removal_interval << " versions"
	    << dendl;
    txsize = removal_interval;
  }
  ceph_assert(removal_interval > 0);

  uint64_t num_pruned = 0;
  while (num_pruned + removal_interval <= txsize) {
    last_pinned = manifest.get_last_pinned();

    if (last_pinned + prune_interval > last_to_pin) {
      break;
    }
    ceph_assert(last_pinned < last_to_pin);

    version_t next_pinned = last_pinned + prune_interval;
    ceph_assert(next_pinned <= last_to_pin);
    manifest.pin(next_pinned);

    dout(20) << __func__
	     << " last_pinned " << last_pinned
	     << " next_pinned " << next_pinned
	     << " num_pruned " << num_pruned
	     << " removal interval (" << (last_pinned+1)
	     << ".." << (next_pinned-1) << ")"
	     << " txsize " << txsize << dendl;

    // the pinned endpoints must still exist in the store
    ceph_assert(map_exists(last_pinned));
    ceph_assert(map_exists(next_pinned));

    // erase every full map strictly between the two pinned versions
    for (version_t v = last_pinned+1; v < next_pinned; ++v) {
      ceph_assert(!manifest.is_pinned(v));

      dout(20) << __func__ << " pruning full osdmap e" << v << dendl;
      string full_key = mon.store->combine_strings("full", v);
      tx->erase(get_service_name(), full_key);
      ++num_pruned;
    }
  }

  // should_prune() guaranteed at least one interval to prune
  ceph_assert(num_pruned > 0);

  // persist the updated pinned set alongside the erasures
  bufferlist bl;
  manifest.encode(bl);
  tx->put(get_service_name(), "osdmap_manifest", bl);

  return true;
}
2712
2713
2714 // -------------
2715
// Paxos read path: handle queries (and updates that may be damped)
// against the current committed osdmap without creating a proposal.
// Returns true when the message is fully handled here (replied to or
// dropped); false to route it on to prepare_update().
bool OSDMonitor::preprocess_query(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // READs
  case MSG_MON_COMMAND:
    try {
      return preprocess_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }
  case CEPH_MSG_MON_GET_OSDMAP:
    return preprocess_get_osdmap(op);

    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return preprocess_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return preprocess_mark_me_dead(op);
  case MSG_OSD_FULL:
    return preprocess_full(op);
  case MSG_OSD_FAILURE:
    return preprocess_failure(op);
  case MSG_OSD_BOOT:
    return preprocess_boot(op);
  case MSG_OSD_ALIVE:
    return preprocess_alive(op);
  case MSG_OSD_PG_CREATED:
    return preprocess_pg_created(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return preprocess_pg_ready_to_merge(op);
  case MSG_OSD_PGTEMP:
    return preprocess_pgtemp(op);
  case MSG_OSD_BEACON:
    return preprocess_beacon(op);

  case CEPH_MSG_POOLOP:
    return preprocess_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return preprocess_remove_snaps(op);

  case MSG_MON_GET_PURGED_SNAPS:
    return preprocess_get_purged_snaps(op);

  default:
    // every message type routed to this service must be handled above
    ceph_abort();
    return true;
  }
}
2771
// Paxos write path: apply a message to the pending osdmap increment.
// Each handler returns true when a proposal should be triggered.
// Only message types that passed preprocess_query() reach here.
bool OSDMonitor::prepare_update(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  Message *m = op->get_req();
  dout(7) << "prepare_update " << *m << " from " << m->get_orig_source_inst() << dendl;

  switch (m->get_type()) {
    // damp updates
  case MSG_OSD_MARK_ME_DOWN:
    return prepare_mark_me_down(op);
  case MSG_OSD_MARK_ME_DEAD:
    return prepare_mark_me_dead(op);
  case MSG_OSD_FULL:
    return prepare_full(op);
  case MSG_OSD_FAILURE:
    return prepare_failure(op);
  case MSG_OSD_BOOT:
    return prepare_boot(op);
  case MSG_OSD_ALIVE:
    return prepare_alive(op);
  case MSG_OSD_PG_CREATED:
    return prepare_pg_created(op);
  case MSG_OSD_PGTEMP:
    return prepare_pgtemp(op);
  case MSG_OSD_PG_READY_TO_MERGE:
    return prepare_pg_ready_to_merge(op);
  case MSG_OSD_BEACON:
    return prepare_beacon(op);

  case MSG_MON_COMMAND:
    try {
      return prepare_command(op);
    } catch (const bad_cmd_get& e) {
      // malformed command arguments: reply with EINVAL instead of crashing
      bufferlist bl;
      mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
      return true;
    }

  case CEPH_MSG_POOLOP:
    return prepare_pool_op(op);

  case MSG_REMOVE_SNAPS:
    return prepare_remove_snaps(op);


  default:
    // every message type routed here must be handled above
    ceph_abort();
  }

  return false;
}
2823
2824 bool OSDMonitor::should_propose(double& delay)
2825 {
2826 dout(10) << "should_propose" << dendl;
2827
2828 // if full map, propose immediately! any subsequent changes will be clobbered.
2829 if (pending_inc.fullmap.length())
2830 return true;
2831
2832 // adjust osd weights?
2833 if (!osd_weight.empty() &&
2834 osd_weight.size() == (unsigned)osdmap.get_max_osd()) {
2835 dout(0) << " adjusting osd weights based on " << osd_weight << dendl;
2836 osdmap.adjust_osd_weights(osd_weight, pending_inc);
2837 delay = 0.0;
2838 osd_weight.clear();
2839 return true;
2840 }
2841
2842 return PaxosService::should_propose(delay);
2843 }
2844
2845
2846
2847 // ---------------------------
2848 // READs
2849
// Answer an explicit MMonGetOSDMap request with as many full and
// incremental maps from the requested ranges as fit within both the
// osd_map_message_max (count) and osd_map_message_max_bytes budgets.
bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MMonGetOSDMap>();

  // encode with the peer's connection features when known; otherwise
  // fall back to the quorum's feature set
  uint64_t features = mon.get_quorum_con_features();
  if (op->get_session() && op->get_session()->con_features)
    features = op->get_session()->con_features;

  dout(10) << __func__ << " " << *m << dendl;
  MOSDMap *reply = new MOSDMap(mon.monmap->fsid, features);
  epoch_t first = get_first_committed();
  epoch_t last = osdmap.get_epoch();
  int max = g_conf()->osd_map_message_max;
  ssize_t max_bytes = g_conf()->osd_map_message_max_bytes;
  // full maps first...
  for (epoch_t e = std::max(first, m->get_full_first());
       e <= std::min(last, m->get_full_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->maps[e];
    int r = get_version_full(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // ...then incrementals, drawing on the same count/byte budget
  for (epoch_t e = std::max(first, m->get_inc_first());
       e <= std::min(last, m->get_inc_last()) && max > 0 && max_bytes > 0;
       ++e, --max) {
    bufferlist& bl = reply->incremental_maps[e];
    int r = get_version(e, features, bl);
    ceph_assert(r >= 0);
    max_bytes -= bl.length();
  }
  // tell the peer the span we can serve so it knows what to re-request
  reply->oldest_map = first;
  reply->newest_map = last;
  mon.send_reply(op, reply);
  return true;
}
2886
2887
2888 // ---------------------------
2889 // UPDATEs
2890
2891 // failure --
2892
2893 bool OSDMonitor::check_source(MonOpRequestRef op, uuid_d fsid) {
2894 // check permissions
2895 MonSession *session = op->get_session();
2896 if (!session)
2897 return true;
2898 if (!session->is_capable("osd", MON_CAP_X)) {
2899 dout(0) << "got MOSDFailure from entity with insufficient caps "
2900 << session->caps << dendl;
2901 return true;
2902 }
2903 if (fsid != mon.monmap->fsid) {
2904 dout(0) << "check_source: on fsid " << fsid
2905 << " != " << mon.monmap->fsid << dendl;
2906 return true;
2907 }
2908 return false;
2909 }
2910
2911
// Vet an osd failure report against the current map. Unauthorized,
// stale or duplicate reports are answered/dropped here (return true);
// genuinely new reports fall through to prepare_failure() (false).
bool OSDMonitor::preprocess_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  // who is target_osd
  int badboy = m->get_target_osd();

  // check permissions
  if (check_source(op, m->fsid))
    goto didit;

  // first, verify the reporting host is valid
  if (m->get_orig_source().is_osd()) {
    int from = m->get_orig_source().num();
    if (!osdmap.exists(from) ||
	!osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) ||
	(osdmap.is_down(from) && m->if_osd_failed())) {
      // reporter is unknown or stale; send it a map so it catches up
      dout(5) << "preprocess_failure from dead osd." << from
	      << ", ignoring" << dendl;
      send_incremental(op, m->get_epoch()+1);
      goto didit;
    }
  }


  // weird?
  if (osdmap.is_down(badboy)) {
    dout(5) << "preprocess_failure dne(/dup?): osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }
  if (osdmap.get_addrs(badboy) != m->get_target_addrs()) {
    // report targets an old instance of this osd id
    dout(5) << "preprocess_failure wrong osd: report osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << " != map's " << osdmap.get_addrs(badboy)
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  // already reported?
  // NOTE(review): the is_down() half of this condition looks
  // unreachable -- a down target was already caught by the "weird?"
  // check above; confirm before simplifying.
  if (osdmap.is_down(badboy) ||
      osdmap.get_up_from(badboy) > m->get_epoch()) {
    dout(5) << "preprocess_failure dup/old: osd." << m->get_target_osd()
	    << " " << m->get_target_addrs()
	    << ", from " << m->get_orig_source() << dendl;
    if (m->get_epoch() < osdmap.get_epoch())
      send_incremental(op, m->get_epoch()+1);
    goto didit;
  }

  if (!can_mark_down(badboy)) {
    dout(5) << "preprocess_failure ignoring report of osd."
	    << m->get_target_osd() << " " << m->get_target_addrs()
	    << " from " << m->get_orig_source() << dendl;
    goto didit;
  }

  dout(10) << "preprocess_failure new: osd." << m->get_target_osd()
	   << " " << m->get_target_addrs()
	   << ", from " << m->get_orig_source() << dendl;
  return false;

 didit:
  // fully handled here; make sure no reply is left pending
  mon.no_reply(op);
  return true;
}
2983
2984 class C_AckMarkedDown : public C_MonOp {
2985 OSDMonitor *osdmon;
2986 public:
2987 C_AckMarkedDown(
2988 OSDMonitor *osdmon,
2989 MonOpRequestRef op)
2990 : C_MonOp(op), osdmon(osdmon) {}
2991
2992 void _finish(int r) override {
2993 if (r == 0) {
2994 auto m = op->get_req<MOSDMarkMeDown>();
2995 osdmon->mon.send_reply(
2996 op,
2997 new MOSDMarkMeDown(
2998 m->fsid,
2999 m->target_osd,
3000 m->target_addrs,
3001 m->get_epoch(),
3002 false)); // ACK itself does not request an ack
3003 } else if (r == -EAGAIN) {
3004 osdmon->dispatch(op);
3005 } else {
3006 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r);
3007 }
3008 }
3009 ~C_AckMarkedDown() override {
3010 }
3011 };
3012
3013 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op)
3014 {
3015 op->mark_osdmon_event(__func__);
3016 auto m = op->get_req<MOSDMarkMeDown>();
3017 int from = m->target_osd;
3018
3019 // check permissions
3020 if (check_source(op, m->fsid))
3021 goto reply;
3022
3023 // first, verify the reporting host is valid
3024 if (!m->get_orig_source().is_osd())
3025 goto reply;
3026
3027 if (!osdmap.exists(from) ||
3028 osdmap.is_down(from) ||
3029 osdmap.get_addrs(from) != m->target_addrs) {
3030 dout(5) << "preprocess_mark_me_down from dead osd."
3031 << from << ", ignoring" << dendl;
3032 send_incremental(op, m->get_epoch()+1);
3033 goto reply;
3034 }
3035
3036 // no down might be set
3037 if (!can_mark_down(from))
3038 goto reply;
3039
3040 dout(10) << "MOSDMarkMeDown for: " << m->get_orig_source()
3041 << " " << m->target_addrs << dendl;
3042 return false;
3043
3044 reply:
3045 if (m->request_ack) {
3046 Context *c(new C_AckMarkedDown(this, op));
3047 c->complete(0);
3048 }
3049 return true;
3050 }
3051
3052 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op)
3053 {
3054 op->mark_osdmon_event(__func__);
3055 auto m = op->get_req<MOSDMarkMeDown>();
3056 int target_osd = m->target_osd;
3057
3058 ceph_assert(osdmap.is_up(target_osd));
3059 ceph_assert(osdmap.get_addrs(target_osd) == m->target_addrs);
3060
3061 mon.clog->info() << "osd." << target_osd << " marked itself " << ((m->down_and_dead) ? "down and dead" : "down");
3062 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3063 if (m->down_and_dead) {
3064 if (!pending_inc.new_xinfo.count(target_osd)) {
3065 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3066 }
3067 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
3068 }
3069 if (m->request_ack)
3070 wait_for_finished_proposal(op, new C_AckMarkedDown(this, op));
3071 return true;
3072 }
3073
3074 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op)
3075 {
3076 op->mark_osdmon_event(__func__);
3077 auto m = op->get_req<MOSDMarkMeDead>();
3078 int from = m->target_osd;
3079
3080 // check permissions
3081 if (check_source(op, m->fsid)) {
3082 mon.no_reply(op);
3083 return true;
3084 }
3085
3086 // first, verify the reporting host is valid
3087 if (!m->get_orig_source().is_osd()) {
3088 mon.no_reply(op);
3089 return true;
3090 }
3091
3092 if (!osdmap.exists(from) ||
3093 !osdmap.is_down(from)) {
3094 dout(5) << __func__ << " from nonexistent or up osd." << from
3095 << ", ignoring" << dendl;
3096 send_incremental(op, m->get_epoch()+1);
3097 mon.no_reply(op);
3098 return true;
3099 }
3100
3101 return false;
3102 }
3103
3104 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op)
3105 {
3106 op->mark_osdmon_event(__func__);
3107 auto m = op->get_req<MOSDMarkMeDead>();
3108 int target_osd = m->target_osd;
3109
3110 ceph_assert(osdmap.is_down(target_osd));
3111
3112 mon.clog->info() << "osd." << target_osd << " marked itself dead as of e"
3113 << m->get_epoch();
3114 if (!pending_inc.new_xinfo.count(target_osd)) {
3115 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3116 }
3117 pending_inc.new_xinfo[target_osd].dead_epoch = m->get_epoch();
3118 wait_for_finished_proposal(
3119 op,
3120 new LambdaContext(
3121 [op, this] (int r) {
3122 if (r >= 0) {
3123 mon.no_reply(op); // ignore on success
3124 }
3125 }
3126 ));
3127 return true;
3128 }
3129
3130 bool OSDMonitor::can_mark_down(int i)
3131 {
3132 if (osdmap.is_nodown(i)) {
3133 dout(5) << __func__ << " osd." << i << " is marked as nodown, "
3134 << "will not mark it down" << dendl;
3135 return false;
3136 }
3137
3138 int num_osds = osdmap.get_num_osds();
3139 if (num_osds == 0) {
3140 dout(5) << __func__ << " no osds" << dendl;
3141 return false;
3142 }
3143 int up = osdmap.get_num_up_osds() - pending_inc.get_net_marked_down(&osdmap);
3144 float up_ratio = (float)up / (float)num_osds;
3145 if (up_ratio < g_conf()->mon_osd_min_up_ratio) {
3146 dout(2) << __func__ << " current up_ratio " << up_ratio << " < min "
3147 << g_conf()->mon_osd_min_up_ratio
3148 << ", will not mark osd." << i << " down" << dendl;
3149 return false;
3150 }
3151 return true;
3152 }
3153
3154 bool OSDMonitor::can_mark_up(int i)
3155 {
3156 if (osdmap.is_noup(i)) {
3157 dout(5) << __func__ << " osd." << i << " is marked as noup, "
3158 << "will not mark it up" << dendl;
3159 return false;
3160 }
3161
3162 return true;
3163 }
3164
3165 /**
3166 * @note the parameter @p i apparently only exists here so we can output the
3167 * osd's id on messages.
3168 */
3169 bool OSDMonitor::can_mark_out(int i)
3170 {
3171 if (osdmap.is_noout(i)) {
3172 dout(5) << __func__ << " osd." << i << " is marked as noout, "
3173 << "will not mark it out" << dendl;
3174 return false;
3175 }
3176
3177 int num_osds = osdmap.get_num_osds();
3178 if (num_osds == 0) {
3179 dout(5) << __func__ << " no osds" << dendl;
3180 return false;
3181 }
3182 int in = osdmap.get_num_in_osds() - pending_inc.get_net_marked_out(&osdmap);
3183 float in_ratio = (float)in / (float)num_osds;
3184 if (in_ratio < g_conf()->mon_osd_min_in_ratio) {
3185 if (i >= 0)
3186 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3187 << g_conf()->mon_osd_min_in_ratio
3188 << ", will not mark osd." << i << " out" << dendl;
3189 else
3190 dout(5) << __func__ << " current in_ratio " << in_ratio << " < min "
3191 << g_conf()->mon_osd_min_in_ratio
3192 << ", will not mark osds out" << dendl;
3193 return false;
3194 }
3195
3196 return true;
3197 }
3198
3199 bool OSDMonitor::can_mark_in(int i)
3200 {
3201 if (osdmap.is_noin(i)) {
3202 dout(5) << __func__ << " osd." << i << " is marked as noin, "
3203 << "will not mark it in" << dendl;
3204 return false;
3205 }
3206
3207 return true;
3208 }
3209
// Walk all outstanding failure records and queue a down-mark for each osd
// that now has sufficient evidence (per check_failure()).  Stale records
// are dropped along the way.
// Returns true if at least one osd was marked down in pending_inc, i.e. a
// proposal is warranted.
bool OSDMonitor::check_failures(utime_t now)
{
  bool found_failure = false;
  auto p = failure_info.begin();
  while (p != failure_info.end()) {
    auto& [target_osd, fi] = *p;
    if (can_mark_down(target_osd) &&
	check_failure(now, target_osd, fi)) {
      // down-mark queued; the record stays until process_failures() runs
      found_failure = true;
      ++p;
    } else if (is_failure_stale(now, fi)) {
      // the reports sat around too long without a verdict; forget them
      dout(10) << " dropping stale failure_info for osd." << target_osd
	       << " from " << fi.reporters.size() << " reporters"
	       << dendl;
      p = failure_info.erase(p);
    } else {
      ++p;
    }
  }
  return found_failure;
}
3231
3232 utime_t OSDMonitor::get_grace_time(utime_t now,
3233 int target_osd,
3234 failure_info_t& fi) const
3235 {
3236 utime_t orig_grace(g_conf()->osd_heartbeat_grace, 0);
3237 if (!g_conf()->mon_osd_adjust_heartbeat_grace) {
3238 return orig_grace;
3239 }
3240 utime_t grace = orig_grace;
3241 double halflife = (double)g_conf()->mon_osd_laggy_halflife;
3242 double decay_k = ::log(.5) / halflife;
3243
3244 // scale grace period based on historical probability of 'lagginess'
3245 // (false positive failures due to slowness).
3246 const osd_xinfo_t& xi = osdmap.get_xinfo(target_osd);
3247 const utime_t failed_for = now - fi.get_failed_since();
3248 double decay = exp((double)failed_for * decay_k);
3249 dout(20) << " halflife " << halflife << " decay_k " << decay_k
3250 << " failed_for " << failed_for << " decay " << decay << dendl;
3251 double my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
3252 grace += my_grace;
3253
3254 // consider the peers reporting a failure a proxy for a potential
3255 // 'subcluster' over the overall cluster that is similarly
3256 // laggy. this is clearly not true in all cases, but will sometimes
3257 // help us localize the grace correction to a subset of the system
3258 // (say, a rack with a bad switch) that is unhappy.
3259 double peer_grace = 0;
3260 for (auto& [reporter, report] : fi.reporters) {
3261 if (osdmap.exists(reporter)) {
3262 const osd_xinfo_t& xi = osdmap.get_xinfo(reporter);
3263 utime_t elapsed = now - xi.down_stamp;
3264 double decay = exp((double)elapsed * decay_k);
3265 peer_grace += decay * (double)xi.laggy_interval * xi.laggy_probability;
3266 }
3267 }
3268 peer_grace /= (double)fi.reporters.size();
3269 grace += peer_grace;
3270 dout(10) << " osd." << target_osd << " has "
3271 << fi.reporters.size() << " reporters, "
3272 << grace << " grace (" << orig_grace << " + " << my_grace
3273 << " + " << peer_grace << "), max_failed_since " << fi.get_failed_since()
3274 << dendl;
3275
3276 return grace;
3277 }
3278
3279 bool OSDMonitor::check_failure(utime_t now, int target_osd, failure_info_t& fi)
3280 {
3281 // already pending failure?
3282 if (pending_inc.new_state.count(target_osd) &&
3283 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3284 dout(10) << " already pending failure" << dendl;
3285 return true;
3286 }
3287
3288 set<string> reporters_by_subtree;
3289 auto reporter_subtree_level = g_conf().get_val<string>("mon_osd_reporter_subtree_level");
3290 ceph_assert(fi.reporters.size());
3291 for (auto p = fi.reporters.begin(); p != fi.reporters.end();) {
3292 // get the parent bucket whose type matches with "reporter_subtree_level".
3293 // fall back to OSD if the level doesn't exist.
3294 if (osdmap.exists(p->first)) {
3295 auto reporter_loc = osdmap.crush->get_full_location(p->first);
3296 if (auto iter = reporter_loc.find(reporter_subtree_level);
3297 iter == reporter_loc.end()) {
3298 reporters_by_subtree.insert("osd." + to_string(p->first));
3299 } else {
3300 reporters_by_subtree.insert(iter->second);
3301 }
3302 ++p;
3303 } else {
3304 fi.cancel_report(p->first);;
3305 p = fi.reporters.erase(p);
3306 }
3307 }
3308 if (reporters_by_subtree.size() < g_conf().get_val<uint64_t>("mon_osd_min_down_reporters")) {
3309 return false;
3310 }
3311 const utime_t failed_for = now - fi.get_failed_since();
3312 const utime_t grace = get_grace_time(now, target_osd, fi);
3313 if (failed_for >= grace) {
3314 dout(1) << " we have enough reporters to mark osd." << target_osd
3315 << " down" << dendl;
3316 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3317
3318 mon.clog->info() << "osd." << target_osd << " failed ("
3319 << osdmap.crush->get_full_location_ordered_string(
3320 target_osd)
3321 << ") ("
3322 << (int)reporters_by_subtree.size()
3323 << " reporters from different "
3324 << reporter_subtree_level << " after "
3325 << failed_for << " >= grace " << grace << ")";
3326 return true;
3327 }
3328 return false;
3329 }
3330
3331 bool OSDMonitor::is_failure_stale(utime_t now, failure_info_t& fi) const
3332 {
3333 // if it takes too long to either cancel the report to mark the osd down,
3334 // some reporters must have failed to cancel their reports. let's just
3335 // forget these reports.
3336 const utime_t failed_for = now - fi.get_failed_since();
3337 auto heartbeat_grace = cct->_conf.get_val<int64_t>("osd_heartbeat_grace");
3338 auto heartbeat_stale = cct->_conf.get_val<int64_t>("osd_heartbeat_stale");
3339 return failed_for >= (heartbeat_grace + heartbeat_stale);
3340 }
3341
3342 void OSDMonitor::force_failure(int target_osd, int by)
3343 {
3344 // already pending failure?
3345 if (pending_inc.new_state.count(target_osd) &&
3346 pending_inc.new_state[target_osd] & CEPH_OSD_UP) {
3347 dout(10) << " already pending failure" << dendl;
3348 return;
3349 }
3350
3351 dout(1) << " we're forcing failure of osd." << target_osd << dendl;
3352 pending_inc.new_state[target_osd] = CEPH_OSD_UP;
3353 if (!pending_inc.new_xinfo.count(target_osd)) {
3354 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3355 }
3356 pending_inc.new_xinfo[target_osd].dead_epoch = pending_inc.epoch;
3357
3358 mon.clog->info() << "osd." << target_osd << " failed ("
3359 << osdmap.crush->get_full_location_ordered_string(target_osd)
3360 << ") (connection refused reported by osd." << by << ")";
3361 return;
3362 }
3363
// Record (or cancel) an osd failure report in failure_info, possibly
// queueing a down-mark.  Returns true iff pending_inc changed and a
// proposal is needed.  Reporters are never replied to directly
// (mon.no_reply); they learn the outcome from subsequent map updates.
bool OSDMonitor::prepare_failure(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFailure>();
  dout(1) << "prepare_failure osd." << m->get_target_osd()
	  << " " << m->get_target_addrs()
	  << " from " << m->get_orig_source()
	  << " is reporting failure:" << m->if_osd_failed() << dendl;

  int target_osd = m->get_target_osd();
  int reporter = m->get_orig_source().num();
  // preprocess_failure() already verified these
  ceph_assert(osdmap.is_up(target_osd));
  ceph_assert(osdmap.get_addrs(target_osd) == m->get_target_addrs());

  mon.no_reply(op);

  if (m->if_osd_failed()) {
    // calculate failure time: the reporter tells us how long the target
    // has been unresponsive (failed_for), relative to message receipt
    utime_t now = ceph_clock_now();
    utime_t failed_since =
      m->get_recv_stamp() - utime_t(m->failed_for, 0);

    // add a report
    if (m->is_immediate()) {
      // e.g. connection refused: skip the grace-period machinery
      mon.clog->debug() << "osd." << m->get_target_osd()
			<< " reported immediately failed by "
			<< m->get_orig_source();
      force_failure(target_osd, reporter);
      return true;
    }
    mon.clog->debug() << "osd." << m->get_target_osd() << " reported failed by "
		      << m->get_orig_source();

    failure_info_t& fi = failure_info[target_osd];
    fi.add_report(reporter, failed_since, op);
    // propose only if this report pushed the osd over the threshold
    return check_failure(now, target_osd, fi);
  } else {
    // remove the report
    mon.clog->debug() << "osd." << m->get_target_osd()
		      << " failure report canceled by "
		      << m->get_orig_source();
    if (failure_info.count(target_osd)) {
      failure_info_t& fi = failure_info[target_osd];
      fi.cancel_report(reporter);
      if (fi.reporters.empty()) {
	dout(10) << " removing last failure_info for osd." << target_osd
		 << dendl;
	failure_info.erase(target_osd);
      } else {
	dout(10) << " failure_info for osd." << target_osd << " now "
		 << fi.reporters.size() << " reporters" << dendl;
      }
    } else {
      dout(10) << " no failure_info for osd." << target_osd << dendl;
    }
  }

  return false;
}
3423
3424 void OSDMonitor::process_failures()
3425 {
3426 map<int,failure_info_t>::iterator p = failure_info.begin();
3427 while (p != failure_info.end()) {
3428 if (osdmap.is_up(p->first)) {
3429 ++p;
3430 } else {
3431 dout(10) << "process_failures osd." << p->first << dendl;
3432 list<MonOpRequestRef> ls;
3433 p->second.take_report_messages(ls);
3434 failure_info.erase(p++);
3435
3436 while (!ls.empty()) {
3437 MonOpRequestRef o = ls.front();
3438 if (o) {
3439 o->mark_event(__func__);
3440 MOSDFailure *m = o->get_req<MOSDFailure>();
3441 send_latest(o, m->get_epoch());
3442 mon.no_reply(o);
3443 }
3444 ls.pop_front();
3445 }
3446 }
3447 }
3448 }
3449
3450 void OSDMonitor::take_all_failures(list<MonOpRequestRef>& ls)
3451 {
3452 dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl;
3453
3454 for (map<int,failure_info_t>::iterator p = failure_info.begin();
3455 p != failure_info.end();
3456 ++p) {
3457 p->second.take_report_messages(ls);
3458 }
3459 failure_info.clear();
3460 }
3461
3462 int OSDMonitor::get_grace_interval_threshold()
3463 {
3464 int halflife = g_conf()->mon_osd_laggy_halflife;
3465 // Scale the halflife period (default: 1_hr) by
3466 // a factor (48) to calculate the threshold.
3467 int grace_threshold_factor = 48;
3468 return halflife * grace_threshold_factor;
3469 }
3470
3471 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval)
3472 {
3473 int grace_interval_threshold_secs = get_grace_interval_threshold();
3474 if (last_failed_interval > grace_interval_threshold_secs) {
3475 dout(1) << " last_failed_interval " << last_failed_interval
3476 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3477 << dendl;
3478 return true;
3479 }
3480 return false;
3481 }
3482
3483 void OSDMonitor::set_default_laggy_params(int target_osd)
3484 {
3485 if (pending_inc.new_xinfo.count(target_osd) == 0) {
3486 pending_inc.new_xinfo[target_osd] = osdmap.osd_xinfo[target_osd];
3487 }
3488 osd_xinfo_t& xi = pending_inc.new_xinfo[target_osd];
3489 xi.down_stamp = pending_inc.modified;
3490 xi.laggy_probability = 0.0;
3491 xi.laggy_interval = 0;
3492 dout(20) << __func__ << " reset laggy, now xi " << xi << dendl;
3493 }
3494
3495
3496 // boot --
3497
// Pre-screen an OSD boot message (MOSDBoot).
// Returns true if handled here (ignored, or answered as a duplicate /
// stale boot); false to let prepare_boot() actually mark the osd up.
bool OSDMonitor::preprocess_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  int from = m->get_orig_source_inst().name.num();

  // check permissions, ignore if failed (no response expected)
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "got preprocess_boot message from entity with insufficient caps"
	    << session->caps << dendl;
    goto ignore;
  }

  // the booting osd must belong to this cluster
  if (m->sb.cluster_fsid != mon.monmap->fsid) {
    dout(0) << "preprocess_boot on fsid " << m->sb.cluster_fsid
	    << " != " << mon.monmap->fsid << dendl;
    goto ignore;
  }

  if (m->get_orig_source_inst().addr.is_blank_ip()) {
    dout(0) << "preprocess_boot got blank addr for " << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  ceph_assert(m->get_orig_source_inst().name.is_osd());

  // lower bound of N-2
  if (!HAVE_FEATURE(m->osd_features, SERVER_OCTOPUS)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because the osd lacks CEPH_FEATURE_SERVER_OCTOPUS";
    goto ignore;
  }

  // make sure osd versions do not span more than 3 releases
  if (HAVE_FEATURE(m->osd_features, SERVER_PACIFIC) &&
      osdmap.require_osd_release < ceph_release_t::nautilus) {
    mon.clog->info() << "disallowing boot of pacific+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < nautilus";
    goto ignore;
  }
  if (HAVE_FEATURE(m->osd_features, SERVER_QUINCY) &&
      osdmap.require_osd_release < ceph_release_t::octopus) {
    mon.clog->info() << "disallowing boot of quincy+ OSD "
		     << m->get_orig_source_inst()
		     << " because require_osd_release < octopus";
    goto ignore;
  }

  // stretch mode requires every osd to support it
  if (osdmap.stretch_mode_enabled &&
      !(m->osd_features & CEPH_FEATUREMASK_STRETCH_MODE)) {
    mon.clog->info() << "disallowing boot of OSD "
		     << m->get_orig_source_inst()
		     << " because stretch mode is on and OSD lacks support";
    goto ignore;
  }

  // already booted?
  if (osdmap.is_up(from) &&
      osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()) &&
      osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs)) {
    // yup.
    dout(7) << "preprocess_boot dup from " << m->get_orig_source()
	    << " " << m->get_orig_source_addrs()
	    << " =~ " << osdmap.get_addrs(from) << dendl;
    _booted(op, false);
    return true;
  }

  // same osd id but a different osd uuid: ignore (likely a replacement
  // device while the old entry still exists in the map)
  if (osdmap.exists(from) &&
      !osdmap.get_uuid(from).is_zero() &&
      osdmap.get_uuid(from) != m->sb.osd_fsid) {
    dout(7) << __func__ << " from " << m->get_orig_source_inst()
	    << " clashes with existing osd: different fsid"
	    << " (ours: " << osdmap.get_uuid(from)
	    << " ; theirs: " << m->sb.osd_fsid << ")" << dendl;
    goto ignore;
  }

  // stale boot message sent before the osd's most recent up interval
  if (osdmap.exists(from) &&
      osdmap.get_info(from).up_from > m->version &&
      osdmap.get_most_recent_addrs(from).legacy_equals(
	m->get_orig_source_addrs())) {
    dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  // noup?
  if (!can_mark_up(from)) {
    dout(7) << "preprocess_boot ignoring boot from " << m->get_orig_source_inst() << dendl;
    send_latest(op, m->sb.current_epoch+1);
    return true;
  }

  dout(10) << "preprocess_boot from " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3603
// Actually admit a booting osd into the pending map.
// preprocess_boot() has already filtered dups, feature mismatches and
// noup; here we handle three cases: (1) still up from a previous
// incarnation -> mark down first and retry, (2) mark-up already pending
// -> just wait, (3) normal path -> record addrs/uuid/metadata/laggy
// stats and mark it up (and possibly in).
bool OSDMonitor::prepare_boot(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDBoot>();
  dout(7) << __func__ << " from " << m->get_source()
	  << " sb " << m->sb
	  << " client_addrs" << m->get_connection()->get_peer_addrs()
	  << " cluster_addrs " << m->cluster_addrs
	  << " hb_back_addrs " << m->hb_back_addrs
	  << " hb_front_addrs " << m->hb_front_addrs
	  << dendl;

  ceph_assert(m->get_orig_source().is_osd());
  int from = m->get_orig_source().num();

  // does this osd exist?
  if (from >= osdmap.get_max_osd()) {
    dout(1) << "boot from osd." << from << " >= max_osd "
	    << osdmap.get_max_osd() << dendl;
    return false;
  }

  // effective state = committed state with pending flips applied
  // (new_state entries are XORed into the osd's state bits)
  int oldstate = osdmap.exists(from) ? osdmap.get_state(from) : CEPH_OSD_NEW;
  if (pending_inc.new_state.count(from))
    oldstate ^= pending_inc.new_state[from];

  // already up? mark down first?
  if (osdmap.is_up(from)) {
    dout(7) << __func__ << " was up, first marking down osd." << from << " "
	    << osdmap.get_addrs(from) << dendl;
    // preprocess should have caught these; if not, assert.
    ceph_assert(!osdmap.get_addrs(from).legacy_equals(
		  m->get_orig_source_addrs()) ||
		!osdmap.get_cluster_addrs(from).legacy_equals(m->cluster_addrs));
    ceph_assert(osdmap.get_uuid(from) == m->sb.osd_fsid);

    if (pending_inc.new_state.count(from) == 0 ||
	(pending_inc.new_state[from] & CEPH_OSD_UP) == 0) {
      // mark previous guy down
      pending_inc.new_state[from] = CEPH_OSD_UP;
    }
    // retry the boot after the down-mark commits
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else if (pending_inc.new_up_client.count(from)) {
    // already prepared, just wait
    dout(7) << __func__ << " already prepared, waiting on "
	    << m->get_orig_source_addr() << dendl;
    wait_for_finished_proposal(op, new C_RetryMessage(this, op));
  } else {
    // mark new guy up.
    pending_inc.new_up_client[from] = m->get_orig_source_addrs();
    pending_inc.new_up_cluster[from] = m->cluster_addrs;
    pending_inc.new_hb_back_up[from] = m->hb_back_addrs;
    pending_inc.new_hb_front_up[from] = m->hb_front_addrs;

    down_pending_out.erase(from); // if any

    // carry the osd's superblock weight into the map (0 = leave as-is)
    if (m->sb.weight)
      osd_weight[from] = m->sb.weight;

    // set uuid?
    dout(10) << " setting osd." << from << " uuid to " << m->sb.osd_fsid
	     << dendl;
    if (!osdmap.exists(from) || osdmap.get_uuid(from) != m->sb.osd_fsid) {
      // preprocess should have caught this; if not, assert.
      ceph_assert(!osdmap.exists(from) || osdmap.get_uuid(from).is_zero());
      pending_inc.new_uuid[from] = m->sb.osd_fsid;
    }

    // fresh osd? (it has no maps at all) -- mark the old interval lost too
    if (m->sb.newest_map == 0 && osdmap.exists(from)) {
      const osd_info_t& i = osdmap.get_info(from);
      if (i.up_from > i.lost_at) {
	dout(10) << " fresh osd; marking lost_at too" << dendl;
	pending_inc.new_lost[from] = osdmap.get_epoch();
      }
    }

    // metadata
    bufferlist osd_metadata;
    encode(m->metadata, osd_metadata);
    pending_metadata[from] = osd_metadata;
    pending_metadata_rm.erase(from);

    // adjust last clean unmount epoch?
    const osd_info_t& info = osdmap.get_info(from);
    dout(10) << " old osd_info: " << info << dendl;
    if (m->sb.mounted > info.last_clean_begin ||
	(m->sb.mounted == info.last_clean_begin &&
	 m->sb.clean_thru > info.last_clean_end)) {
      epoch_t begin = m->sb.mounted;
      epoch_t end = m->sb.clean_thru;

      dout(10) << __func__ << " osd." << from << " last_clean_interval "
	       << "[" << info.last_clean_begin << "," << info.last_clean_end
	       << ") -> [" << begin << "-" << end << ")"
	       << dendl;
      pending_inc.new_last_clean_interval[from] =
	pair<epoch_t,epoch_t>(begin, end);
    }

    // update the laggy statistics that scale the failure grace period
    if (pending_inc.new_xinfo.count(from) == 0)
      pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
    osd_xinfo_t& xi = pending_inc.new_xinfo[from];
    if (m->boot_epoch == 0) {
      // not a restart after being marked down: decay the laggy estimates
      xi.laggy_probability *= (1.0 - g_conf()->mon_osd_laggy_weight);
      xi.laggy_interval *= (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " not laggy, new xi " << xi << dendl;
    } else {
      // rebooting after a down-mark: fold the down interval into the
      // exponentially-weighted laggy interval/probability estimates
      if (xi.down_stamp.sec()) {
	int interval = ceph_clock_now().sec() -
	  xi.down_stamp.sec();
	if (g_conf()->mon_osd_laggy_max_interval &&
	    (interval > g_conf()->mon_osd_laggy_max_interval)) {
	  interval = g_conf()->mon_osd_laggy_max_interval;
	}
	xi.laggy_interval =
	  interval * g_conf()->mon_osd_laggy_weight +
	  xi.laggy_interval * (1.0 - g_conf()->mon_osd_laggy_weight);
      }
      xi.laggy_probability =
	g_conf()->mon_osd_laggy_weight +
	xi.laggy_probability * (1.0 - g_conf()->mon_osd_laggy_weight);
      dout(10) << " laggy, now xi " << xi << dendl;
    }

    // set features shared by the osd
    if (m->osd_features)
      xi.features = m->osd_features;
    else
      xi.features = m->get_connection()->get_features();

    // mark in?
    if ((g_conf()->mon_osd_auto_mark_auto_out_in &&
	 (oldstate & CEPH_OSD_AUTOOUT)) ||
	(g_conf()->mon_osd_auto_mark_new_in && (oldstate & CEPH_OSD_NEW)) ||
	(g_conf()->mon_osd_auto_mark_in)) {
      if (can_mark_in(from)) {
	if (xi.old_weight > 0) {
	  // restore the weight it had before being auto-marked out
	  pending_inc.new_weight[from] = xi.old_weight;
	  xi.old_weight = 0;
	} else {
	  pending_inc.new_weight[from] = CEPH_OSD_IN;
	}
      } else {
	dout(7) << __func__ << " NOIN set, will not mark in "
		<< m->get_orig_source_addr() << dendl;
      }
    }

    // wait
    wait_for_finished_proposal(op, new C_Booted(this, op));
  }
  return true;
}
3758
3759 void OSDMonitor::_booted(MonOpRequestRef op, bool logit)
3760 {
3761 op->mark_osdmon_event(__func__);
3762 auto m = op->get_req<MOSDBoot>();
3763 dout(7) << "_booted " << m->get_orig_source_inst()
3764 << " w " << m->sb.weight << " from " << m->sb.current_epoch << dendl;
3765
3766 if (logit) {
3767 mon.clog->info() << m->get_source() << " " << m->get_orig_source_addrs()
3768 << " boot";
3769 }
3770
3771 send_latest(op, m->sb.current_epoch+1);
3772 }
3773
3774
3775 // -------------
3776 // full
3777
// Pre-screen an MOSDFull message (an osd reporting its full /
// backfillfull / nearfull status).  Returns true if handled or ignored
// here; false to let prepare_full() record the state change.
bool OSDMonitor::preprocess_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  int from = m->get_orig_source().num();
  set<string> state;
  unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;

  // check permissions, ignore if failed
  MonSession *session = op->get_session();
  if (!session)
    goto ignore;
  if (!session->is_capable("osd", MON_CAP_X)) {
    dout(0) << "MOSDFull from entity with insufficient privileges:"
	    << session->caps << dendl;
    goto ignore;
  }

  // ignore a full message from the osd instance that already went down
  if (!osdmap.exists(from)) {
    dout(7) << __func__ << " ignoring full message from nonexistent "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }
  // drop messages from a stale instance of this osd id: either it is down
  // and the addrs match its last known instance, or it is up but with
  // different addrs
  if ((!osdmap.is_up(from) &&
       osdmap.get_most_recent_addrs(from).legacy_equals(
	 m->get_orig_source_addrs())) ||
      (osdmap.is_up(from) &&
       !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs()))) {
    dout(7) << __func__ << " ignoring full message from down "
	    << m->get_orig_source_inst() << dendl;
    goto ignore;
  }

  // symbolic names of the osd's *current* state bits, for logging
  // NOTE(review): 'state' is computed from the map, not from m->state, so
  // the "want state" log below shows the old state -- confirm intended.
  OSDMap::calc_state_set(osdmap.get_state(from), state);

  // nothing to change?  just reply with the map the osd asked about.
  if ((osdmap.get_state(from) & mask) == m->state) {
    dout(7) << __func__ << " state already " << state << " for osd." << from
	    << " " << m->get_orig_source_inst() << dendl;
    _reply_map(op, m->version);
    goto ignore;
  }

  dout(10) << __func__ << " want state " << state << " for osd." << from
	   << " " << m->get_orig_source_inst() << dendl;
  return false;

 ignore:
  return true;
}
3828
// Record a full/backfillfull/nearfull state change for an osd in the
// pending incremental.  new_state holds bits to be XORed into the osd's
// state on commit, so we compute a flip mask relative to the committed
// map, replacing any flips of these bits already pending.
bool OSDMonitor::prepare_full(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MOSDFull>();
  const int from = m->get_orig_source().num();

  const unsigned mask = CEPH_OSD_NEARFULL | CEPH_OSD_BACKFILLFULL | CEPH_OSD_FULL;
  const unsigned want_state = m->state & mask; // safety first

  // effective current state = committed state with pending flips applied
  unsigned cur_state = osdmap.get_state(from);
  auto p = pending_inc.new_state.find(from);
  if (p != pending_inc.new_state.end()) {
    cur_state ^= p->second;
  }
  cur_state &= mask;

  set<string> want_state_set, cur_state_set;
  OSDMap::calc_state_set(want_state, want_state_set);
  OSDMap::calc_state_set(cur_state, cur_state_set);

  if (cur_state != want_state) {
    if (p != pending_inc.new_state.end()) {
      // drop any pending flips of these bits before recomputing them
      p->second &= ~mask;
    } else {
      pending_inc.new_state[from] = 0;
    }
    // flip whatever differs between the committed state and the wanted one
    pending_inc.new_state[from] |= (osdmap.get_state(from) & mask) ^ want_state;
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " -> " << want_state_set << dendl;
  } else {
    dout(7) << __func__ << " osd." << from << " " << cur_state_set
	    << " = wanted " << want_state_set << ", just waiting" << dendl;
  }

  // reply with the map once the in-flight proposal commits
  wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
  return true;
}
3866
3867 // -------------
3868 // alive
3869
3870 bool OSDMonitor::preprocess_alive(MonOpRequestRef op)
3871 {
3872 op->mark_osdmon_event(__func__);
3873 auto m = op->get_req<MOSDAlive>();
3874 int from = m->get_orig_source().num();
3875
3876 // check permissions, ignore if failed
3877 MonSession *session = op->get_session();
3878 if (!session)
3879 goto ignore;
3880 if (!session->is_capable("osd", MON_CAP_X)) {
3881 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3882 << session->caps << dendl;
3883 goto ignore;
3884 }
3885
3886 if (!osdmap.is_up(from) ||
3887 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
3888 dout(7) << "preprocess_alive ignoring alive message from down "
3889 << m->get_orig_source() << " " << m->get_orig_source_addrs()
3890 << dendl;
3891 goto ignore;
3892 }
3893
3894 if (osdmap.get_up_thru(from) >= m->want) {
3895 // yup.
3896 dout(7) << "preprocess_alive want up_thru " << m->want << " dup from " << m->get_orig_source_inst() << dendl;
3897 _reply_map(op, m->version);
3898 return true;
3899 }
3900
3901 dout(10) << "preprocess_alive want up_thru " << m->want
3902 << " from " << m->get_orig_source_inst() << dendl;
3903 return false;
3904
3905 ignore:
3906 return true;
3907 }
3908
// Leader-side handler for MOSDAlive: stage an up_thru bump for the osd in
// the pending incremental (via update_up_thru) and reply with the updated
// map once the proposal commits.
3909 bool OSDMonitor::prepare_alive(MonOpRequestRef op)
3910 {
3911 op->mark_osdmon_event(__func__);
3912 auto m = op->get_req<MOSDAlive>();
3913 int from = m->get_orig_source().num();
3914
// Intentionally disabled clog noise for alive messages (kept for reference).
3915 if (0) { // we probably don't care much about these
3916 mon.clog->debug() << m->get_orig_source_inst() << " alive";
3917 }
3918
3919 dout(7) << "prepare_alive want up_thru " << m->want << " have " << m->version
3920 << " from " << m->get_orig_source_inst() << dendl;
3921
3922 update_up_thru(from, m->version); // set to the latest map the OSD has
3923 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
3924 return true;
3925 }
3926
3927 void OSDMonitor::_reply_map(MonOpRequestRef op, epoch_t e)
3928 {
3929 op->mark_osdmon_event(__func__);
3930 dout(7) << "_reply_map " << e
3931 << " from " << op->get_req()->get_orig_source_inst()
3932 << dendl;
3933 send_latest(op, e);
3934 }
3935
3936 // pg_created
// Read-only path for MOSDPGCreated ("pg X has been created").  Only checks
// caps; the actual bookkeeping must happen on the leader, so valid messages
// always return false to be forwarded.  No reply is ever sent.
3937 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op)
3938 {
3939 op->mark_osdmon_event(__func__);
3940 auto m = op->get_req<MOSDPGCreated>();
3941 dout(10) << __func__ << " " << *m << dendl;
3942 auto session = op->get_session();
// This message type gets no direct reply regardless of outcome.
3943 mon.no_reply(op);
3944 if (!session) {
3945 dout(10) << __func__ << ": no monitor session!" << dendl;
3946 return true;
3947 }
3948 if (!session->is_capable("osd", MON_CAP_X)) {
3949 derr << __func__ << " received from entity "
3950 << "with insufficient privileges " << session->caps << dendl;
3951 return true;
3952 }
3953 // always forward the "created!" to the leader
3954 return false;
3955 }
3956
// Leader-side handler for MOSDPGCreated: queue the pgid so it can be
// recorded as created (pending_created_pgs is presumably consumed when the
// next map is encoded -- confirm against the rest of this file).
3957 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op)
3958 {
3959 op->mark_osdmon_event(__func__);
3960 auto m = op->get_req<MOSDPGCreated>();
3961 dout(10) << __func__ << " " << *m << dendl;
3962 auto src = m->get_orig_source();
3963 auto from = src.num();
// Only accept from an osd that the current map shows as up at the same
// address; anything else is from a stale interval.
3964 if (!src.is_osd() ||
3965 !mon.osdmon()->osdmap.is_up(from) ||
3966 !mon.osdmon()->osdmap.get_addrs(from).legacy_equals(
3967 m->get_orig_source_addrs())) {
3968 dout(1) << __func__ << " ignoring stats from non-active osd." << dendl;
// false: nothing staged, no proposal needed for this message.
3969 return false;
3970 }
3971 pending_created_pgs.push_back(m->pgid);
// true: a change was staged and should be committed.
3972 return true;
3973 }
3974
// Read-only validation for MOSDPGReadyToMerge.  Drops messages that fail
// cap checks or that no longer match the committed pool's pg_num state;
// returns false to forward plausible requests to the prepare path.
3975 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op)
3976 {
3977 op->mark_osdmon_event(__func__);
3978 auto m = op->get_req<MOSDPGReadyToMerge>();
3979 dout(10) << __func__ << " " << *m << dendl;
3980 const pg_pool_t *pi;
3981 auto session = op->get_session();
3982 if (!session) {
3983 dout(10) << __func__ << ": no monitor session!" << dendl;
3984 goto ignore;
3985 }
3986 if (!session->is_capable("osd", MON_CAP_X)) {
3987 derr << __func__ << " received from entity "
3988 << "with insufficient privileges " << session->caps << dendl;
3989 goto ignore;
3990 }
3991 pi = osdmap.get_pg_pool(m->pgid.pool());
3992 if (!pi) {
3993 derr << __func__ << " pool for " << m->pgid << " dne" << dendl;
3994 goto ignore;
3995 }
// The merge source must be exactly the last pg in the pool: pg_num must
// equal ps+1.  The next two checks reject both "already merged" and
// "not the last pg" cases.
3996 if (pi->get_pg_num() <= m->pgid.ps()) {
3997 dout(20) << " pg_num " << pi->get_pg_num() << " already < " << m->pgid << dendl;
3998 goto ignore;
3999 }
4000 if (pi->get_pg_num() != m->pgid.ps() + 1) {
4001 derr << " OSD trying to merge wrong pgid " << m->pgid << dendl;
4002 goto ignore;
4003 }
// pg_num_pending must already point at (or below) the merge source.
4004 if (pi->get_pg_num_pending() > m->pgid.ps()) {
4005 dout(20) << " pg_num_pending " << pi->get_pg_num_pending() << " > " << m->pgid << dendl;
4006 goto ignore;
4007 }
// Looks valid; forward to prepare_pg_ready_to_merge().
4008 return false;
4009
4010 ignore:
4011 mon.no_reply(op);
4012 return true;
4013 }
4014
// Leader-side handler for MOSDPGReadyToMerge: either complete the pg merge
// (dec_pg_num) or back it off, working on a copy of the pool that includes
// any already-pending changes.
4015 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op)
4016 {
4017 op->mark_osdmon_event(__func__);
4018 auto m = op->get_req<MOSDPGReadyToMerge>();
4019 dout(10) << __func__ << " " << *m << dendl;
4020 pg_pool_t p;
// Start from the pending pool state if one exists, else the committed one,
// so we do not clobber other staged changes.
4021 if (pending_inc.new_pools.count(m->pgid.pool()))
4022 p = pending_inc.new_pools[m->pgid.pool()];
4023 else
4024 p = *osdmap.get_pg_pool(m->pgid.pool());
// Re-validate against the (possibly newer) pending state; if pg_num moved
// underneath us, retry the message after the current proposal lands.
4025 if (p.get_pg_num() != m->pgid.ps() + 1 ||
4026 p.get_pg_num_pending() > m->pgid.ps()) {
4027 dout(10) << __func__
4028 << " race with concurrent pg_num[_pending] update, will retry"
4029 << dendl;
4030 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
4031 return true;
4032 }
4033
4034 if (m->ready) {
// Commit the merge: shrink pg_num and record the epochs the osd reported.
4035 p.dec_pg_num(m->pgid,
4036 pending_inc.epoch,
4037 m->source_version,
4038 m->target_version,
4039 m->last_epoch_started,
4040 m->last_epoch_clean);
4041 p.last_change = pending_inc.epoch;
4042 } else {
4043 // back off the merge attempt!
4044 p.set_pg_num_pending(p.get_pg_num());
4045 }
4046
4047 // force pre-nautilus clients to resend their ops, since they
4048 // don't understand pg_num_pending changes form a new interval
4049 p.last_force_op_resend_prenautilus = pending_inc.epoch;
4050
4051 pending_inc.new_pools[m->pgid.pool()] = p;
4052
// Test hook: with configurable probability, immediately bounce pg_num back
// up via a self-injected "osd pool set" command to exercise merge races.
4053 auto prob = g_conf().get_val<double>("mon_inject_pg_merge_bounce_probability");
4054 if (m->ready &&
4055 prob > 0 &&
4056 prob > (double)(rand() % 1000)/1000.0) {
4057 derr << __func__ << " injecting pg merge pg_num bounce" << dendl;
4058 auto n = new MMonCommand(mon.monmap->get_fsid());
4059 n->set_connection(m->get_connection());
4060 n->cmd = { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4061 osdmap.get_pool_name(m->pgid.pool()) +
4062 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4063 stringify(m->pgid.ps() + 1) + "\"}" };
4064 MonOpRequestRef nop = mon.op_tracker.create_request<MonOpRequest>(n);
4065 nop->set_type_service();
4066 wait_for_finished_proposal(op, new C_RetryMessage(this, nop));
4067 } else {
4068 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->version));
4069 }
4070 return true;
4071 }
4072
4073
4074 // -------------
4075 // pg_temp changes
4076
// Read-only path for MOSDPGTemp.  Filters out mappings that are stale or
// no-ops; only returns false (forward to prepare_pgtemp) when at least one
// entry would actually change pg_temp/primary_temp state.
4077 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op)
4078 {
4079 auto m = op->get_req<MOSDPGTemp>();
4080 dout(10) << "preprocess_pgtemp " << *m << dendl;
4081 mempool::osdmap::vector<int> empty;
4082 int from = m->get_orig_source().num();
4083 size_t ignore_cnt = 0;
4084
4085 // check caps
4086 MonSession *session = op->get_session();
4087 if (!session)
4088 goto ignore;
4089 if (!session->is_capable("osd", MON_CAP_X)) {
4090 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4091 << session->caps << dendl;
4092 goto ignore;
4093 }
4094
// Sender must be up at the same address in the committed map.
4095 if (!osdmap.is_up(from) ||
4096 !osdmap.get_addrs(from).legacy_equals(m->get_orig_source_addrs())) {
4097 dout(7) << "ignoring pgtemp message from down "
4098 << m->get_orig_source() << " " << m->get_orig_source_addrs()
4099 << dendl;
4100 goto ignore;
4101 }
4102
// Forced requests bypass all the filtering below.
4103 if (m->forced) {
4104 return false;
4105 }
4106
4107 for (auto p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4108 dout(20) << " " << p->first
4109 << (osdmap.pg_temp->count(p->first) ? osdmap.pg_temp->get(p->first) : empty)
4110 << " -> " << p->second << dendl;
4111
4112 // does the pool exist?
4113 if (!osdmap.have_pg_pool(p->first.pool())) {
4114 /*
4115 * 1. If the osdmap does not have the pool, it means the pool has been
4116 * removed in-between the osd sending this message and us handling it.
4117 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4118 * not exist in the pending either, as the osds would not send a
4119 * message about a pool they know nothing about (yet).
4120 * 3. However, if the pool does exist in the pending, then it must be a
4121 * new pool, and not relevant to this message (see 1).
4122 */
4123 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4124 << ": pool has been removed" << dendl;
4125 ignore_cnt++;
4126 continue;
4127 }
4128
4129 int acting_primary = -1;
4130 osdmap.pg_to_up_acting_osds(
4131 p->first, nullptr, nullptr, nullptr, &acting_primary);
4132 if (acting_primary != from) {
4133 /* If the source isn't the primary based on the current osdmap, we know
4134 * that the interval changed and that we can discard this message.
4135 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4136 * which of two pg temp mappings on the same pg is more recent.
4137 */
4138 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4139 << ": primary has changed" << dendl;
4140 ignore_cnt++;
4141 continue;
4142 }
4143
4144 // removal?
4145 if (p->second.empty() && (osdmap.pg_temp->count(p->first) ||
4146 osdmap.primary_temp->count(p->first)))
4147 return false;
4148 // change?
4149 // NOTE: we assume that this will clear pg_primary, so consider
4150 // an existing pg_primary field to imply a change
4151 if (p->second.size() &&
4152 (osdmap.pg_temp->count(p->first) == 0 ||
4153 osdmap.pg_temp->get(p->first) != p->second ||
4154 osdmap.primary_temp->count(p->first)))
4155 return false;
4156 }
4157
4158 // should we ignore all the pgs?
4159 if (ignore_cnt == m->pg_temp.size())
4160 goto ignore;
4161
// Nothing would change; reply with the current map instead of proposing.
4162 dout(7) << "preprocess_pgtemp e" << m->map_epoch << " no changes from " << m->get_orig_source_inst() << dendl;
4163 _reply_map(op, m->map_epoch);
4164 return true;
4165
4166 ignore:
4167 mon.no_reply(op);
4168 return true;
4169 }
4170
4171 void OSDMonitor::update_up_thru(int from, epoch_t up_thru)
4172 {
4173 epoch_t old_up_thru = osdmap.get_up_thru(from);
4174 auto ut = pending_inc.new_up_thru.find(from);
4175 if (ut != pending_inc.new_up_thru.end()) {
4176 old_up_thru = ut->second;
4177 }
4178 if (up_thru > old_up_thru) {
4179 // set up_thru too, so the osd doesn't have to ask again
4180 pending_inc.new_up_thru[from] = up_thru;
4181 }
4182 }
4183
// Leader-side handler for MOSDPGTemp: stage the requested pg_temp mappings
// in the pending incremental (skipping pools that are gone or pending
// removal), clear any primary_temp for those pgs, and bump up_thru.
4184 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op)
4185 {
4186 op->mark_osdmon_event(__func__);
4187 auto m = op->get_req<MOSDPGTemp>();
4188 int from = m->get_orig_source().num();
4189 dout(7) << "prepare_pgtemp e" << m->map_epoch << " from " << m->get_orig_source_inst() << dendl;
4190 for (map<pg_t,vector<int32_t> >::iterator p = m->pg_temp.begin(); p != m->pg_temp.end(); ++p) {
4191 uint64_t pool = p->first.pool();
4192 if (pending_inc.old_pools.count(pool)) {
4193 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4194 << ": pool pending removal" << dendl;
4195 continue;
4196 }
4197 if (!osdmap.have_pg_pool(pool)) {
4198 dout(10) << __func__ << " ignore " << p->first << " -> " << p->second
4199 << ": pool has been removed" << dendl;
4200 continue;
4201 }
4202 pending_inc.new_pg_temp[p->first] =
4203 mempool::osdmap::vector<int>(p->second.begin(), p->second.end());
4204
4205 // unconditionally clear pg_primary (until this message can encode
4206 // a change for that, too.. at which point we need to also fix
4207 // preprocess_pg_temp)
4208 if (osdmap.primary_temp->count(p->first) ||
4209 pending_inc.new_primary_temp.count(p->first))
4210 pending_inc.new_primary_temp[p->first] = -1;
4211 }
4212
4213 // set up_thru too, so the osd doesn't have to ask again
4214 update_up_thru(from, m->map_epoch);
4215
4216 wait_for_finished_proposal(op, new C_ReplyMap(this, op, m->map_epoch));
4217 return true;
4218 }
4219
4220
4221 // ---
4222
// Read-only path for MRemoveSnaps.  Returns false (forward to prepare) if
// any listed snap still needs to be removed; otherwise acks octopus+ peers
// and swallows the message.
4223 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op)
4224 {
4225 op->mark_osdmon_event(__func__);
4226 auto m = op->get_req<MRemoveSnaps>();
4227 dout(7) << "preprocess_remove_snaps " << *m << dendl;
4228
4229 // check privilege, ignore if failed
4230 MonSession *session = op->get_session();
4231 mon.no_reply(op);
4232 if (!session)
4233 goto ignore;
4234 if (!session->caps.is_capable(
4235 cct,
4236 session->entity_name,
4237 "osd", "osd pool rmsnap", {}, true, true, false,
4238 session->get_peer_socket_addr())) {
4239 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4240 << session->caps << dendl;
4241 goto ignore;
4242 }
4243
// Scan for any snap that is not yet recorded as removed; one is enough to
// require a proposal.
4244 for (map<int, vector<snapid_t> >::iterator q = m->snaps.begin();
4245 q != m->snaps.end();
4246 ++q) {
4247 if (!osdmap.have_pg_pool(q->first)) {
4248 dout(10) << " ignoring removed_snaps " << q->second
4249 << " on non-existent pool " << q->first << dendl;
4250 continue;
4251 }
4252 const pg_pool_t *pi = osdmap.get_pg_pool(q->first);
4253 for (vector<snapid_t>::iterator p = q->second.begin();
4254 p != q->second.end();
4255 ++p) {
// A snap above the pool's snap_seq is also treated as needing work.
4256 if (*p > pi->get_snap_seq() ||
4257 !_is_removed_snap(q->first, *p)) {
4258 return false;
4259 }
4260 }
4261 }
4262
// Everything is already removed; octopus+ senders expect an explicit ack.
4263 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4264 auto reply = make_message<MRemoveSnaps>();
4265 reply->snaps = m->snaps;
4266 mon.send_reply(op, reply.detach());
4267 }
4268
4269 ignore:
4270 return true;
4271 }
4272
// Leader-side handler for MRemoveSnaps: queue each not-yet-removed snap in
// the pending incremental (and, pre-octopus, in the pool's removed_snaps
// interval set), bumping snap_seq/snap_epoch as needed.
4273 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op)
4274 {
4275 op->mark_osdmon_event(__func__);
4276 auto m = op->get_req<MRemoveSnaps>();
4277 dout(7) << "prepare_remove_snaps " << *m << dendl;
4278
4279 for (auto& [pool, snaps] : m->snaps) {
4280 if (!osdmap.have_pg_pool(pool)) {
4281 dout(10) << " ignoring removed_snaps " << snaps
4282 << " on non-existent pool " << pool << dendl;
4283 continue;
4284 }
4285
4286 pg_pool_t& pi = osdmap.pools[pool];
4287 for (auto s : snaps) {
// Skip snaps already removed in the committed map, in a pending pool
// update, or already queued in new_removed_snaps for this proposal.
4288 if (!_is_removed_snap(pool, s) &&
4289 (!pending_inc.new_pools.count(pool) ||
4290 !pending_inc.new_pools[pool].removed_snaps.contains(s)) &&
4291 (!pending_inc.new_removed_snaps.count(pool) ||
4292 !pending_inc.new_removed_snaps[pool].contains(s))) {
4293 pg_pool_t *newpi = pending_inc.get_new_pool(pool, &pi);
// Pre-octopus maps track removed snaps directly in the pool object.
4294 if (osdmap.require_osd_release < ceph_release_t::octopus) {
4295 newpi->removed_snaps.insert(s);
4296 dout(10) << " pool " << pool << " removed_snaps added " << s
4297 << " (now " << newpi->removed_snaps << ")" << dendl;
4298 }
4299 newpi->flags |= pg_pool_t::FLAG_SELFMANAGED_SNAPS;
// Keep snap_seq at least as high as any removed snap id.
4300 if (s > newpi->get_snap_seq()) {
4301 dout(10) << " pool " << pool << " snap_seq "
4302 << newpi->get_snap_seq() << " -> " << s << dendl;
4303 newpi->set_snap_seq(s);
4304 }
4305 newpi->set_snap_epoch(pending_inc.epoch);
4306 dout(10) << " added pool " << pool << " snap " << s
4307 << " to removed_snaps queue" << dendl;
4308 pending_inc.new_removed_snaps[pool].insert(s);
4309 }
4310 }
4311 }
4312
// Octopus+ senders get an explicit ack once the proposal commits.
4313 if (HAVE_FEATURE(m->get_connection()->get_features(), SERVER_OCTOPUS)) {
4314 auto reply = make_message<MRemoveSnaps>();
4315 reply->snaps = m->snaps;
4316 wait_for_finished_proposal(op, new C_ReplyOp(this, op, reply));
4317 }
4318
4319 return true;
4320 }
4321
// Answer MMonGetPurgedSnaps: walk the OSD_SNAP kv prefix collecting
// per-epoch purged-snap interval sets in (m->start, m->last], capped at a
// rough message-size limit, and reply with the collected range.
4322 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op)
4323 {
4324 op->mark_osdmon_event(__func__);
4325 auto m = op->get_req<MMonGetPurgedSnaps>();
4326 dout(7) << __func__ << " " << *m << dendl;
4327
4328 map<epoch_t,mempool::osdmap::map<int64_t,snap_interval_set_t>> r;
4329
4330 string k = make_purged_snap_epoch_key(m->start);
4331 auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
4332 it->upper_bound(k);
// 'epoch' doubles as the loop cursor and the reply's end-of-range marker;
// it stays m->last if no entries are found.
4333 unsigned long epoch = m->last;
4334 while (it->valid()) {
// Stop at the first key outside the purged_epoch_ namespace.
4335 if (it->key().find("purged_epoch_") != 0) {
4336 break;
4337 }
4338 string k = it->key();
4339 int n = sscanf(k.c_str(), "purged_epoch_%lx", &epoch);
4340 if (n != 1) {
4341 derr << __func__ << " unable to parse key '" << it->key() << "'" << dendl;
4342 } else if (epoch > m->last) {
4343 break;
4344 } else {
4345 bufferlist bl = it->value();
4346 auto p = bl.cbegin();
4347 auto &v = r[epoch];
4348 try {
4349 ceph::decode(v, p);
4350 } catch (ceph::buffer::error& e) {
4351 derr << __func__ << " unable to parse value for key '" << it->key()
4352 << "': \n";
4353 bl.hexdump(*_dout);
4354 *_dout << dendl;
4355 }
// Rough size estimate for this entry (reusing n as the accumulator).
4356 n += 4 + v.size() * 16;
4357 }
4358 if (n > 1048576) {
4359 // impose a semi-arbitrary limit to message size
4360 break;
4361 }
4362 it->next();
4363 }
4364
4365 auto reply = make_message<MMonGetPurgedSnapsReply>(m->start, epoch);
4366 reply->purged_snaps.swap(r);
4367 mon.send_reply(op, reply.detach());
4368
4369 return true;
4370 }
4371
4372 // osd beacon
// Read-only path for MOSDBeacon: only verifies session and caps; all valid
// beacons are forwarded to the leader (see comment below).  Never replied to.
4373 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op)
4374 {
4375 op->mark_osdmon_event(__func__);
4376 // check caps
4377 auto session = op->get_session();
4378 mon.no_reply(op);
4379 if (!session) {
4380 dout(10) << __func__ << " no monitor session!" << dendl;
4381 return true;
4382 }
4383 if (!session->is_capable("osd", MON_CAP_X)) {
4384 derr << __func__ << " received from entity "
4385 << "with insufficient privileges " << session->caps << dendl;
4386 return true;
4387 }
4388 // Always forward the beacon to the leader, even if they are the same as
4389 // the old one. The leader will mark as down osds that haven't sent
4390 // beacon for a few minutes.
4391 return false;
4392 }
4393
// Leader-side handler for MOSDBeacon: record liveness (last_osd_report),
// the osd's map epoch, and per-pg last-epoch-clean data.  Returns true
// (propose) only when the beacon advances last_purged_snaps_scrub in xinfo;
// plain liveness updates need no map change.
4394 bool OSDMonitor::prepare_beacon(MonOpRequestRef op)
4395 {
4396 op->mark_osdmon_event(__func__);
4397 const auto beacon = op->get_req<MOSDBeacon>();
4398 const auto src = beacon->get_orig_source();
4399 dout(10) << __func__ << " " << *beacon
4400 << " from " << src << dendl;
4401 int from = src.num();
4402
4403 if (!src.is_osd() ||
4404 !osdmap.is_up(from) ||
4405 !osdmap.get_addrs(from).legacy_equals(beacon->get_orig_source_addrs())) {
4406 if (src.is_osd() && !osdmap.is_up(from)) {
4407 // share some new maps with this guy in case it may not be
4408 // aware of its own deadness...
4409 send_latest(op, beacon->version+1);
4410 }
4411 dout(1) << " ignoring beacon from non-active osd." << from << dendl;
4412 return false;
4413 }
4414
// Liveness bookkeeping (not part of the osdmap itself).
4415 last_osd_report[from].first = ceph_clock_now();
4416 last_osd_report[from].second = beacon->osd_beacon_report_interval;
4417 osd_epochs[from] = beacon->version;
4418
4419 for (const auto& pg : beacon->pgs) {
4420 if (auto* pool = osdmap.get_pg_pool(pg.pool()); pool != nullptr) {
4421 unsigned pg_num = pool->get_pg_num();
4422 last_epoch_clean.report(pg_num, pg, beacon->min_last_epoch_clean);
4423 }
4424 }
4425
// Only a forward progress of last_purged_snaps_scrub requires staging an
// xinfo change and thus a proposal.
4426 if (osdmap.osd_xinfo[from].last_purged_snaps_scrub <
4427 beacon->last_purged_snaps_scrub) {
4428 if (pending_inc.new_xinfo.count(from) == 0) {
4429 pending_inc.new_xinfo[from] = osdmap.osd_xinfo[from];
4430 }
4431 pending_inc.new_xinfo[from].last_purged_snaps_scrub =
4432 beacon->last_purged_snaps_scrub;
4433 return true;
4434 } else {
4435 return false;
4436 }
4437 }
4438
4439 // ---------------
4440 // map helpers
4441
4442 void OSDMonitor::send_latest(MonOpRequestRef op, epoch_t start)
4443 {
4444 op->mark_osdmon_event(__func__);
4445 dout(5) << "send_latest to " << op->get_req()->get_orig_source_inst()
4446 << " start " << start << dendl;
4447 if (start == 0)
4448 send_full(op);
4449 else
4450 send_incremental(op, start);
4451 }
4452
4453
4454 MOSDMap *OSDMonitor::build_latest_full(uint64_t features)
4455 {
4456 MOSDMap *r = new MOSDMap(mon.monmap->fsid, features);
4457 get_version_full(osdmap.get_epoch(), features, r->maps[osdmap.get_epoch()]);
4458 r->oldest_map = get_first_committed();
4459 r->newest_map = osdmap.get_epoch();
4460 return r;
4461 }
4462
// Build an MOSDMap carrying incremental maps for epochs [from, to] (walked
// newest-first).  Falls back to a full map for any epoch whose incremental
// is missing; aborts if neither form exists.  Caller owns the result.
4463 MOSDMap *OSDMonitor::build_incremental(epoch_t from, epoch_t to, uint64_t features)
4464 {
4465 dout(10) << "build_incremental [" << from << ".." << to << "] with features "
4466 << std::hex << features << std::dec << dendl;
4467 MOSDMap *m = new MOSDMap(mon.monmap->fsid, features);
4468 m->oldest_map = get_first_committed();
4469 m->newest_map = osdmap.get_epoch();
4470
4471 for (epoch_t e = to; e >= from && e > 0; e--) {
4472 bufferlist bl;
4473 int err = get_version(e, features, bl);
4474 if (err == 0) {
4475 ceph_assert(bl.length());
4476 // if (get_version(e, bl) > 0) {
4477 dout(20) << "build_incremental inc " << e << " "
4478 << bl.length() << " bytes" << dendl;
4479 m->incremental_maps[e] = bl;
4480 } else {
// Incremental not stored for this epoch: send the full map instead.
4481 ceph_assert(err == -ENOENT);
4482 ceph_assert(!bl.length());
4483 get_version_full(e, features, bl);
4484 if (bl.length() > 0) {
4485 //else if (get_version("full", e, bl) > 0) {
4486 dout(20) << "build_incremental full " << e << " "
4487 << bl.length() << " bytes" << dendl;
4488 m->maps[e] = bl;
4489 } else {
4490 ceph_abort(); // we should have all maps.
4491 }
4492 }
4493 }
4494 return m;
4495 }
4496
4497 void OSDMonitor::send_full(MonOpRequestRef op)
4498 {
4499 op->mark_osdmon_event(__func__);
4500 dout(5) << "send_full to " << op->get_req()->get_orig_source_inst() << dendl;
4501 mon.send_reply(op, build_latest_full(op->get_session()->con_features));
4502 }
4503
// Send incremental maps starting at 'first' in reply to 'op'.  If the
// session is proxied through another monitor, route a hint back so that
// mon does the sending; otherwise send directly.
4504 void OSDMonitor::send_incremental(MonOpRequestRef op, epoch_t first)
4505 {
4506 op->mark_osdmon_event(__func__);
4507
4508 MonSession *s = op->get_session();
4509 ceph_assert(s);
4510
4511 if (s->proxy_con) {
4512 // oh, we can tell the other mon to do it
4513 dout(10) << __func__ << " asking proxying mon to send_incremental from "
4514 << first << dendl;
4515 MRoute *r = new MRoute(s->proxy_tid, NULL);
4516 r->send_osdmap_first = first;
4517 s->proxy_con->send_message(r);
4518 op->mark_event("reply: send routed send_osdmap_first reply");
4519 } else {
4520 // do it ourselves
4521 send_incremental(first, s, false, op);
4522 }
4523 }
4524
// Core map-sending routine.  Sends maps [first .. current] to 'session',
// starting with a full map if 'first' predates the oldest committed epoch.
// If 'req' is set, exactly one message is sent as a routed reply and we
// return early; otherwise messages go straight to the session's connection,
// stopping after one batch when 'onetime' is set.
4525 void OSDMonitor::send_incremental(epoch_t first,
4526 MonSession *session,
4527 bool onetime,
4528 MonOpRequestRef req)
4529 {
4530 dout(5) << "send_incremental [" << first << ".." << osdmap.get_epoch() << "]"
4531 << " to " << session->name << dendl;
4532
4533 // get feature of the peer
4534 // use quorum_con_features, if it's an anonymous connection.
4535 uint64_t features = session->con_features ? session->con_features :
4536 mon.get_quorum_con_features();
4537
// Skip epochs the session is already known to have.
4538 if (first <= session->osd_epoch) {
4539 dout(10) << __func__ << " " << session->name << " should already have epoch "
4540 << session->osd_epoch << dendl;
4541 first = session->osd_epoch + 1;
4542 }
4543
// Requested range starts before our oldest committed map: seed the peer
// with a full map at the oldest epoch, then continue incrementally.
4544 if (first < get_first_committed()) {
4545 MOSDMap *m = new MOSDMap(osdmap.get_fsid(), features);
4546 m->oldest_map = get_first_committed();
4547 m->newest_map = osdmap.get_epoch();
4548
4549 first = get_first_committed();
4550 bufferlist bl;
4551 int err = get_version_full(first, features, bl);
4552 ceph_assert(err == 0);
4553 ceph_assert(bl.length());
4554 dout(20) << "send_incremental starting with base full "
4555 << first << " " << bl.length() << " bytes" << dendl;
4556 m->maps[first] = bl;
4557
4558 if (req) {
// Routed-reply mode only ever sends one message.
4559 mon.send_reply(req, m);
4560 session->osd_epoch = first;
4561 return;
4562 } else {
4563 session->con->send_message(m);
4564 session->osd_epoch = first;
4565 }
4566 first++;
4567 }
4568
// Send incrementals in batches of osd_map_message_max epochs.
4569 while (first <= osdmap.get_epoch()) {
4570 epoch_t last = std::min<epoch_t>(first + g_conf()->osd_map_message_max - 1,
4571 osdmap.get_epoch());
4572 MOSDMap *m = build_incremental(first, last, features);
4573
4574 if (req) {
4575 // send some maps. it may not be all of them, but it will get them
4576 // started.
4577 mon.send_reply(req, m);
4578 } else {
4579 session->con->send_message(m);
4580 first = last + 1;
4581 }
4582 session->osd_epoch = last;
// Stop after one batch for onetime subscriptions and routed replies.
4583 if (onetime || req)
4584 break;
4585 }
4586 }
4587
// Fetch the incremental map for 'ver', encoded for the current quorum's
// connection features.
4588 int OSDMonitor::get_version(version_t ver, bufferlist& bl)
4589 {
4590 return get_version(ver, mon.get_quorum_con_features(), bl);
4591 }
4592
// Re-encode an incremental map blob in place for a (possibly smaller)
// feature set.  Any embedded full map or crush map is decoded and
// re-encoded with the same reduced features.
4593 void OSDMonitor::reencode_incremental_map(bufferlist& bl, uint64_t features)
4594 {
4595 OSDMap::Incremental inc;
4596 auto q = bl.cbegin();
4597 inc.decode(q);
4598 // always encode with subset of osdmap's canonical features
4599 uint64_t f = features & inc.encode_features;
4600 dout(20) << __func__ << " " << inc.epoch << " with features " << f
4601 << dendl;
4602 bl.clear();
4603 if (inc.fullmap.length()) {
4604 // embedded full map?
4605 OSDMap m;
4606 m.decode(inc.fullmap);
4607 inc.fullmap.clear();
4608 m.encode(inc.fullmap, f | CEPH_FEATURE_RESERVED);
4609 }
4610 if (inc.crush.length()) {
4611 // embedded crush map
4612 CrushWrapper c;
4613 auto p = inc.crush.cbegin();
4614 c.decode(p);
4615 inc.crush.clear();
4616 c.encode(inc.crush, f);
4617 }
4618 inc.encode(bl, f | CEPH_FEATURE_RESERVED);
4619 }
4620
4621 void OSDMonitor::reencode_full_map(bufferlist& bl, uint64_t features)
4622 {
4623 OSDMap m;
4624 auto q = bl.cbegin();
4625 m.decode(q);
4626 // always encode with subset of osdmap's canonical features
4627 uint64_t f = features & m.get_encoding_features();
4628 dout(20) << __func__ << " " << m.get_epoch() << " with features " << f
4629 << dendl;
4630 bl.clear();
4631 m.encode(bl, f | CEPH_FEATURE_RESERVED);
4632 }
4633
// Fetch the incremental map for 'ver' encoded for 'features', consulting
// and populating a cache keyed by (epoch, significant feature bits).
4634 int OSDMonitor::get_version(version_t ver, uint64_t features, bufferlist& bl)
4635 {
4636 uint64_t significant_features = OSDMap::get_significant_features(features);
4637 if (inc_osd_cache.lookup({ver, significant_features}, &bl)) {
4638 return 0;
4639 }
4640 int ret = PaxosService::get_version(ver, bl);
4641 if (ret < 0) {
4642 return ret;
4643 }
4644 // NOTE: this check is imprecise; the OSDMap encoding features may
4645 // be a subset of the latest mon quorum features, but worst case we
4646 // reencode once and then cache the (identical) result under both
4647 // feature masks.
4648 if (significant_features !=
4649 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4650 reencode_incremental_map(bl, features);
4651 }
4652 inc_osd_cache.add_bytes({ver, significant_features}, bl);
4653 return 0;
4654 }
4655
4656 int OSDMonitor::get_inc(version_t ver, OSDMap::Incremental& inc)
4657 {
4658 bufferlist inc_bl;
4659 int err = get_version(ver, inc_bl);
4660 ceph_assert(err == 0);
4661 ceph_assert(inc_bl.length());
4662
4663 auto p = inc_bl.cbegin();
4664 inc.decode(p);
4665 dout(10) << __func__ << " "
4666 << " epoch " << inc.epoch
4667 << " inc_crc " << inc.inc_crc
4668 << " full_crc " << inc.full_crc
4669 << " encode_features " << inc.encode_features << dendl;
4670 return 0;
4671 }
4672
// Reconstruct the full osdmap for epoch 'ver' when the stored full map has
// been trimmed: start from the closest pinned (or cached) full map at or
// below 'ver' and replay incrementals up to 'ver', optionally cross-checking
// CRCs when mon_debug_extra_checks is set.  Encodes the result into 'bl'.
4673 int OSDMonitor::get_full_from_pinned_map(version_t ver, bufferlist& bl)
4674 {
4675 dout(10) << __func__ << " ver " << ver << dendl;
4676
4677 version_t closest_pinned = osdmap_manifest.get_lower_closest_pinned(ver);
4678 if (closest_pinned == 0) {
4679 return -ENOENT;
4680 }
4681 if (closest_pinned > ver) {
4682 dout(0) << __func__ << " pinned: " << osdmap_manifest.pinned << dendl;
4683 }
4684 ceph_assert(closest_pinned <= ver);
4685
4686 dout(10) << __func__ << " closest pinned ver " << closest_pinned << dendl;
4687
4688 // get osdmap incremental maps and apply on top of this one.
4689 bufferlist osdm_bl;
4690 bool has_cached_osdmap = false;
// Prefer a cached full map between the pin and 'ver' to shorten the replay.
4691 for (version_t v = ver-1; v >= closest_pinned; --v) {
4692 if (full_osd_cache.lookup({v, mon.get_quorum_con_features()},
4693 &osdm_bl)) {
4694 dout(10) << __func__ << " found map in cache ver " << v << dendl;
4695 closest_pinned = v;
4696 has_cached_osdmap = true;
4697 break;
4698 }
4699 }
4700
4701 if (!has_cached_osdmap) {
4702 int err = PaxosService::get_version_full(closest_pinned, osdm_bl);
4703 if (err != 0) {
4704 derr << __func__ << " closest pinned map ver " << closest_pinned
4705 << " not available! error: " << cpp_strerror(err) << dendl;
4706 }
4707 ceph_assert(err == 0);
4708 }
4709
4710 ceph_assert(osdm_bl.length());
4711
4712 OSDMap osdm;
4713 osdm.decode(osdm_bl);
4714
4715 dout(10) << __func__ << " loaded osdmap epoch " << closest_pinned
4716 << " e" << osdm.epoch
4717 << " crc " << osdm.get_crc()
4718 << " -- applying incremental maps." << dendl;
4719
// Remember the last incremental's encode_features so the final encode
// matches how the maps were originally written.
4720 uint64_t encode_features = 0;
4721 for (version_t v = closest_pinned + 1; v <= ver; ++v) {
4722 dout(20) << __func__ << " applying inc epoch " << v << dendl;
4723
4724 OSDMap::Incremental inc;
4725 int err = get_inc(v, inc);
4726 ceph_assert(err == 0);
4727
4728 encode_features = inc.encode_features;
4729
4730 err = osdm.apply_incremental(inc);
4731 ceph_assert(err == 0);
4732
4733 // this block performs paranoid checks on map retrieval
4734 if (g_conf().get_val<bool>("mon_debug_extra_checks") &&
4735 inc.full_crc != 0) {
4736
4737 uint64_t f = encode_features;
4738 if (!f) {
4739 f = (mon.quorum_con_features ? mon.quorum_con_features : -1);
4740 }
4741
4742 // encode osdmap to force calculating crcs
4743 bufferlist tbl;
4744 osdm.encode(tbl, f | CEPH_FEATURE_RESERVED);
4745 // decode osdmap to compare crcs with what's expected by incremental
4746 OSDMap tosdm;
4747 tosdm.decode(tbl);
4748
4749 if (tosdm.get_crc() != inc.full_crc) {
4750 derr << __func__
4751 << " osdmap crc mismatch! (osdmap crc " << tosdm.get_crc()
4752 << ", expected " << inc.full_crc << ")" << dendl;
4753 ceph_abort_msg("osdmap crc mismatch");
4754 }
4755 }
4756
4757 // note: we cannot add the recently computed map to the cache, as is,
4758 // because we have not encoded the map into a bl.
4759 }
4760
4761 if (!encode_features) {
4762 dout(10) << __func__
4763 << " last incremental map didn't have features;"
4764 << " defaulting to quorum's or all" << dendl;
4765 encode_features =
4766 (mon.quorum_con_features ? mon.quorum_con_features : -1);
4767 }
4768 osdm.encode(bl, encode_features | CEPH_FEATURE_RESERVED);
4769
4770 return 0;
4771 }
4772
// Fetch the full map for 'ver', encoded for the current quorum's
// connection features.
4773 int OSDMonitor::get_version_full(version_t ver, bufferlist& bl)
4774 {
4775 return get_version_full(ver, mon.get_quorum_con_features(), bl);
4776 }
4777
// Fetch the full map for 'ver' encoded for 'features', consulting and
// populating the full-map cache; if the stored full map has been trimmed,
// rebuild it from a pinned map plus incrementals.
4778 int OSDMonitor::get_version_full(version_t ver, uint64_t features,
4779 bufferlist& bl)
4780 {
4781 uint64_t significant_features = OSDMap::get_significant_features(features);
4782 if (full_osd_cache.lookup({ver, significant_features}, &bl)) {
4783 return 0;
4784 }
4785 int ret = PaxosService::get_version_full(ver, bl);
4786 if (ret == -ENOENT) {
4787 // build map?
4788 ret = get_full_from_pinned_map(ver, bl);
4789 }
4790 if (ret < 0) {
4791 return ret;
4792 }
4793 // NOTE: this check is imprecise; the OSDMap encoding features may
4794 // be a subset of the latest mon quorum features, but worst case we
4795 // reencode once and then cache the (identical) result under both
4796 // feature masks.
4797 if (significant_features !=
4798 OSDMap::get_significant_features(mon.get_quorum_con_features())) {
4799 reencode_full_map(bl, features);
4800 }
4801 full_osd_cache.add_bytes({ver, significant_features}, bl);
4802 return 0;
4803 }
4804
4805 epoch_t OSDMonitor::blocklist(const entity_addrvec_t& av, utime_t until)
4806 {
4807 dout(10) << "blocklist " << av << " until " << until << dendl;
4808 for (auto a : av.v) {
4809 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4810 a.set_type(entity_addr_t::TYPE_ANY);
4811 } else {
4812 a.set_type(entity_addr_t::TYPE_LEGACY);
4813 }
4814 pending_inc.new_blocklist[a] = until;
4815 }
4816 return pending_inc.epoch;
4817 }
4818
4819 epoch_t OSDMonitor::blocklist(entity_addr_t a, utime_t until)
4820 {
4821 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
4822 a.set_type(entity_addr_t::TYPE_ANY);
4823 } else {
4824 a.set_type(entity_addr_t::TYPE_LEGACY);
4825 }
4826 dout(10) << "blocklist " << a << " until " << until << dendl;
4827 pending_inc.new_blocklist[a] = until;
4828 return pending_inc.epoch;
4829 }
4830
4831
// Walk all "osdmap" subscriptions and satisfy each one that is behind.
4832 void OSDMonitor::check_osdmap_subs()
4833 {
4834 dout(10) << __func__ << dendl;
4835 if (!osdmap.get_epoch()) {
4836 return;
4837 }
4838 auto osdmap_subs = mon.session_map.subs.find("osdmap");
4839 if (osdmap_subs == mon.session_map.subs.end()) {
4840 return;
4841 }
4842 auto p = osdmap_subs->second->begin();
4843 while (!p.end()) {
// Advance the iterator before the call: check_osdmap_sub() may remove a
// onetime sub from the session map, invalidating its position.
4844 auto sub = *p;
4845 ++p;
4846 check_osdmap_sub(sub);
4847 }
4848 }
4849
// Satisfy one "osdmap" subscription: send incrementals from sub->next (or
// the latest full map when next == 0), then either drop a onetime sub or
// advance its cursor past the current epoch.
4850 void OSDMonitor::check_osdmap_sub(Subscription *sub)
4851 {
4852 dout(10) << __func__ << " " << sub << " next " << sub->next
4853 << (sub->onetime ? " (onetime)":" (ongoing)") << dendl;
4854 if (sub->next <= osdmap.get_epoch()) {
4855 if (sub->next >= 1)
4856 send_incremental(sub->next, sub->session, sub->incremental_onetime);
4857 else
4858 sub->session->con->send_message(build_latest_full(sub->session->con_features));
4859 if (sub->onetime)
4860 mon.session_map.remove_sub(sub);
4861 else
4862 sub->next = osdmap.get_epoch() + 1;
4863 }
4864 }
4865
// Walk all "osd_pg_creates" subscriptions and push pending pg-create
// messages to each subscribed (up) osd.  No-op until at least one osd is up.
4866 void OSDMonitor::check_pg_creates_subs()
4867 {
4868 if (!osdmap.get_num_up_osds()) {
4869 return;
4870 }
4871 ceph_assert(osdmap.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB);
4872 mon.with_session_map([this](const MonSessionMap& session_map) {
4873 auto pg_creates_subs = session_map.subs.find("osd_pg_creates");
4874 if (pg_creates_subs == session_map.subs.end()) {
4875 return;
4876 }
4877 for (auto sub : *pg_creates_subs->second) {
4878 check_pg_creates_sub(sub);
4879 }
4880 });
4881 }
4882
// Satisfy one "osd_pg_creates" subscription by sending pg creates to the
// subscribed osd, advancing the sub's cursor to what was sent.
4883 void OSDMonitor::check_pg_creates_sub(Subscription *sub)
4884 {
4885 dout(20) << __func__ << " .. " << sub->session->name << dendl;
4886 ceph_assert(sub->type == "osd_pg_creates");
4887 // only send these if the OSD is up. we will check_subs() when they do
4888 // come up so they will get the creates then.
4889 if (sub->session->name.is_osd() &&
4890 mon.osdmon()->osdmap.is_up(sub->session->name.num())) {
4891 sub->next = send_pg_creates(sub->session->name.num(),
4892 sub->session->con.get(),
4893 sub->next);
4894 }
4895 }
4896
4897 void OSDMonitor::do_application_enable(int64_t pool_id,
4898 const std::string &app_name,
4899 const std::string &app_key,
4900 const std::string &app_value,
4901 bool force)
4902 {
4903 ceph_assert(paxos.is_plugged() && is_writeable());
4904
4905 dout(20) << __func__ << ": pool_id=" << pool_id << ", app_name=" << app_name
4906 << dendl;
4907
4908 ceph_assert(osdmap.require_osd_release >= ceph_release_t::luminous);
4909
4910 auto pp = osdmap.get_pg_pool(pool_id);
4911 ceph_assert(pp != nullptr);
4912
4913 pg_pool_t p = *pp;
4914 if (pending_inc.new_pools.count(pool_id)) {
4915 p = pending_inc.new_pools[pool_id];
4916 }
4917
4918 if (app_key.empty()) {
4919 p.application_metadata.insert({app_name, {}});
4920 } else {
4921 if (force) {
4922 p.application_metadata[app_name][app_key] = app_value;
4923 } else {
4924 p.application_metadata.insert({app_name, {{app_key, app_value}}});
4925 }
4926 }
4927 p.last_change = pending_inc.epoch;
4928 pending_inc.new_pools[pool_id] = p;
4929 }
4930
4931 void OSDMonitor::do_set_pool_opt(int64_t pool_id,
4932 pool_opts_t::key_t opt,
4933 pool_opts_t::value_t val)
4934 {
4935 dout(10) << __func__ << " pool: " << pool_id << " option: " << opt
4936 << " val: " << val << dendl;
4937 auto p = pending_inc.new_pools.try_emplace(
4938 pool_id, *osdmap.get_pg_pool(pool_id));
4939 p.first->second.opts.set(opt, val);
4940 }
4941
4942 unsigned OSDMonitor::scan_for_creating_pgs(
4943 const mempool::osdmap::map<int64_t,pg_pool_t>& pools,
4944 const mempool::osdmap::set<int64_t>& removed_pools,
4945 utime_t modified,
4946 creating_pgs_t* creating_pgs) const
4947 {
4948 unsigned queued = 0;
4949 for (auto& p : pools) {
4950 int64_t poolid = p.first;
4951 if (creating_pgs->created_pools.count(poolid)) {
4952 dout(10) << __func__ << " already created " << poolid << dendl;
4953 continue;
4954 }
4955 const pg_pool_t& pool = p.second;
4956 int ruleno = pool.get_crush_rule();
4957 if (ruleno < 0 || !osdmap.crush->rule_exists(ruleno))
4958 continue;
4959
4960 const auto last_scan_epoch = creating_pgs->last_scan_epoch;
4961 const auto created = pool.get_last_change();
4962 if (last_scan_epoch && created <= last_scan_epoch) {
4963 dout(10) << __func__ << " no change in pool " << poolid
4964 << " " << pool << dendl;
4965 continue;
4966 }
4967 if (removed_pools.count(poolid)) {
4968 dout(10) << __func__ << " pool is being removed: " << poolid
4969 << " " << pool << dendl;
4970 continue;
4971 }
4972 dout(10) << __func__ << " queueing pool create for " << poolid
4973 << " " << pool << dendl;
4974 creating_pgs->create_pool(poolid, pool.get_pg_num(),
4975 created, modified);
4976 queued++;
4977 }
4978 return queued;
4979 }
4980
// Rebuild creating_pgs_by_osd_epoch (acting primary -> epoch -> pgs to
// create) from the current creating_pgs set and the latest osdmap
// mapping job.  Tracks, per pg, the epoch the create message should be
// tagged with: the original create epoch, or the mapping epoch when the
// pg is new here or its acting primary has changed.
void OSDMonitor::update_creating_pgs()
{
  dout(10) << __func__ << " " << creating_pgs.pgs.size() << " pgs creating, "
	   << creating_pgs.queue.size() << " pools in queue" << dendl;
  decltype(creating_pgs_by_osd_epoch) new_pgs_by_osd_epoch;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  for (const auto& pg : creating_pgs.pgs) {
    int acting_primary = -1;
    auto pgid = pg.first;
    if (!osdmap.pg_exists(pgid)) {
      dout(20) << __func__ << " ignoring " << pgid << " which should not exist"
	       << dendl;
      continue;
    }
    // default: keep the epoch the pg was originally queued with
    auto mapped = pg.second.create_epoch;
    dout(20) << __func__ << " looking up " << pgid << "@" << mapped << dendl;
    spg_t spgid(pgid);
    mapping.get_primary_and_shard(pgid, &acting_primary, &spgid);
    // check the previous creating_pgs, look for the target to whom the pg was
    // previously mapped
    for (const auto& pgs_by_epoch : creating_pgs_by_osd_epoch) {
      const auto last_acting_primary = pgs_by_epoch.first;
      for (auto& pgs: pgs_by_epoch.second) {
	if (pgs.second.count(spgid)) {
	  if (last_acting_primary == acting_primary) {
	    // same target as before: keep the epoch it was recorded under
	    mapped = pgs.first;
	  } else {
	    dout(20) << __func__ << " " << pgid << " "
		     << " acting_primary:" << last_acting_primary
		     << " -> " << acting_primary << dendl;
	    // note epoch if the target of the create message changed.
	    mapped = mapping.get_epoch();
	  }
	  break;
	} else {
	  // newly creating
	  mapped = mapping.get_epoch();
	}
      }
    }
    dout(10) << __func__ << " will instruct osd." << acting_primary
	     << " to create " << pgid << "@" << mapped << dendl;
    new_pgs_by_osd_epoch[acting_primary][mapped].insert(spgid);
  }
  // atomically swap in the freshly built index and remember which
  // mapping epoch it corresponds to
  creating_pgs_by_osd_epoch = std::move(new_pgs_by_osd_epoch);
  creating_pgs_epoch = mapping.get_epoch();
}
5028
// Send pg-create messages for the given OSD covering epochs >= next.
// Uses the legacy MOSDPGCreate for pre-nautilus clusters and
// MOSDPGCreate2 otherwise.  Returns the epoch the subscriber is now
// current through (last batched epoch + 1), or `next` unchanged when
// there was nothing to send or the index is stale.
epoch_t OSDMonitor::send_pg_creates(int osd, Connection *con, epoch_t next) const
{
  dout(30) << __func__ << " osd." << osd << " next=" << next
	   << " " << creating_pgs_by_osd_epoch << dendl;
  std::lock_guard<std::mutex> l(creating_pgs_lock);
  if (creating_pgs_epoch <= creating_pgs.last_scan_epoch) {
    dout(20) << __func__
	     << " not using stale creating_pgs@" << creating_pgs_epoch << dendl;
    // the subscribers will be updated when the mapping is completed anyway
    return next;
  }
  auto creating_pgs_by_epoch = creating_pgs_by_osd_epoch.find(osd);
  if (creating_pgs_by_epoch == creating_pgs_by_osd_epoch.end())
    return next;
  ceph_assert(!creating_pgs_by_epoch->second.empty());

  MOSDPGCreate *oldm = nullptr; // for pre-mimic OSD compat
  MOSDPGCreate2 *m = nullptr;

  // pre-nautilus OSDs only understand the old message format
  bool old = osdmap.require_osd_release < ceph_release_t::nautilus;

  epoch_t last = 0;
  // walk the epochs the subscriber has not seen yet, batching all the
  // creates into a single message
  for (auto epoch_pgs = creating_pgs_by_epoch->second.lower_bound(next);
       epoch_pgs != creating_pgs_by_epoch->second.end(); ++epoch_pgs) {
    auto epoch = epoch_pgs->first;
    auto& pgs = epoch_pgs->second;
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " : epoch " << epoch << " " << pgs.size() << " pgs" << dendl;
    last = epoch;
    for (auto& pg : pgs) {
      // Need the create time from the monitor using its clock to set
      // last_scrub_stamp upon pg creation.
      auto create = creating_pgs.pgs.find(pg.pgid);
      ceph_assert(create != creating_pgs.pgs.end());
      if (old) {
	// lazily allocate the message on first use
	if (!oldm) {
	  oldm = new MOSDPGCreate(creating_pgs_epoch);
	}
	oldm->mkpg.emplace(pg.pgid,
			   pg_create_t{create->second.create_epoch, pg.pgid, 0});
	oldm->ctimes.emplace(pg.pgid, create->second.create_stamp);
      } else {
	if (!m) {
	  m = new MOSDPGCreate2(creating_pgs_epoch);
	}
	m->pgs.emplace(pg, make_pair(create->second.create_epoch,
				     create->second.create_stamp));
	// pass along pg history/past_intervals when we have them (e.g.
	// creates that resulted from a pg split)
	if (create->second.history.epoch_created) {
	  dout(20) << __func__ << " " << pg << " " << create->second.history
		   << " " << create->second.past_intervals << dendl;
	  m->pg_extra.emplace(pg, make_pair(create->second.history,
					    create->second.past_intervals));
	}
      }
      dout(20) << __func__ << " will create " << pg
	       << " at " << create->second.create_epoch << dendl;
    }
  }
  // at most one of the two messages was built; ownership passes to con
  if (m) {
    con->send_message(m);
  } else if (oldm) {
    con->send_message(oldm);
  } else {
    dout(20) << __func__ << " osd." << osd << " from " << next
	     << " has nothing to send" << dendl;
    return next;
  }

  // sub is current through last + 1
  return last + 1;
}
5100
5101 // TICK
5102
5103
// Periodic maintenance.  On every monitor: refresh the osdmap manifest
// and rebalance the priority cache.  On the leader only: time out silent
// OSDs, mark failed OSDs down, auto-mark long-down OSDs out, expire
// blocklist entries, prune purged snaps, refresh pool status — and
// propose a new map epoch if any of that produced pending changes.
void OSDMonitor::tick()
{
  if (!is_active()) return;

  dout(10) << osdmap << dendl;

  // always update osdmap manifest, regardless of being the leader.
  load_osdmap_manifest();

  // always tune priority cache manager memory on leader and peons
  if (ceph_using_tcmalloc() && mon_memory_autotune) {
    std::lock_guard l(balancer_lock);
    if (pcm != nullptr) {
      pcm->tune_memory();
      pcm->balance();
      _set_new_cache_sizes();
      dout(10) << "tick balancer "
	       << " inc cache_bytes: " << inc_cache->get_cache_bytes()
	       << " inc comtd_bytes: " << inc_cache->get_committed_size()
	       << " inc used_bytes: " << inc_cache->_get_used_bytes()
	       << " inc num_osdmaps: " << inc_cache->_get_num_osdmaps()
	       << dendl;
      dout(10) << "tick balancer "
	       << " full cache_bytes: " << full_cache->get_cache_bytes()
	       << " full comtd_bytes: " << full_cache->get_committed_size()
	       << " full used_bytes: " << full_cache->_get_used_bytes()
	       << " full num_osdmaps: " << full_cache->_get_num_osdmaps()
	       << dendl;
    }
  }

  // everything below mutates pending_inc, which only the leader may do
  if (!mon.is_leader()) return;

  bool do_propose = false;
  utime_t now = ceph_clock_now();

  if (handle_osd_timeouts(now, last_osd_report)) {
    do_propose = true;
  }

  // mark osds down?
  if (check_failures(now)) {
    do_propose = true;
  }

  // Force a proposal if we need to prune; pruning is performed on
  // ``encode_pending()``, hence why we need to regularly trigger a proposal
  // even if there's nothing going on.
  if (is_prune_enabled() && should_prune()) {
    do_propose = true;
  }

  // mark down osds out?

  /* can_mark_out() checks if we can mark osds as being out. The -1 has no
   * influence at all. The decision is made based on the ratio of "in" osds,
   * and the function returns false if this ratio is lower that the minimum
   * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
   */
  if (can_mark_out(-1)) {
    string down_out_subtree_limit = g_conf().get_val<string>(
      "mon_osd_down_out_subtree_limit");
    set<int> down_cache;  // quick cache of down subtrees

    map<int,utime_t>::iterator i = down_pending_out.begin();
    while (i != down_pending_out.end()) {
      int o = i->first;
      utime_t down = now;
      down -= i->second;
      // advance before the body possibly erases the current entry
      ++i;

      if (osdmap.is_down(o) &&
	  osdmap.is_in(o) &&
	  can_mark_out(o)) {
	utime_t orig_grace(g_conf()->mon_osd_down_out_interval, 0);
	utime_t grace = orig_grace;
	double my_grace = 0.0;

	if (g_conf()->mon_osd_adjust_down_out_interval) {
	  // scale grace period the same way we do the heartbeat grace.
	  const osd_xinfo_t& xi = osdmap.get_xinfo(o);
	  double halflife = (double)g_conf()->mon_osd_laggy_halflife;
	  double decay_k = ::log(.5) / halflife;
	  double decay = exp((double)down * decay_k);
	  dout(20) << "osd." << o << " laggy halflife " << halflife << " decay_k " << decay_k
		   << " down for " << down << " decay " << decay << dendl;
	  my_grace = decay * (double)xi.laggy_interval * xi.laggy_probability;
	  grace += my_grace;
	}

	// is this an entire large subtree down?
	if (down_out_subtree_limit.length()) {
	  int type = osdmap.crush->get_type_id(down_out_subtree_limit);
	  if (type > 0) {
	    if (osdmap.containing_subtree_is_down(cct, o, type, &down_cache)) {
	      dout(10) << "tick entire containing " << down_out_subtree_limit
		       << " subtree for osd." << o
		       << " is down; resetting timer" << dendl;
	      // reset timer, too.
	      down_pending_out[o] = now;
	      continue;
	    }
	  }
	}

	// normal path: out after the (possibly laggy-adjusted) grace
	bool down_out = !osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_down_out_interval > 0 && down.sec() >= grace;
	bool destroyed_out = osdmap.is_destroyed(o) &&
	  g_conf()->mon_osd_destroyed_out_interval > 0 &&
	  // this is not precise enough as we did not make a note when this osd
	  // was marked as destroyed, but let's not bother with that
	  // complexity for now.
	  down.sec() >= g_conf()->mon_osd_destroyed_out_interval;
	if (down_out || destroyed_out) {
	  dout(10) << "tick marking osd." << o << " OUT after " << down
		   << " sec (target " << grace << " = " << orig_grace << " + " << my_grace << ")" << dendl;
	  pending_inc.new_weight[o] = CEPH_OSD_OUT;

	  // set the AUTOOUT bit.
	  if (pending_inc.new_state.count(o) == 0)
	    pending_inc.new_state[o] = 0;
	  pending_inc.new_state[o] |= CEPH_OSD_AUTOOUT;

	  // remember previous weight
	  if (pending_inc.new_xinfo.count(o) == 0)
	    pending_inc.new_xinfo[o] = osdmap.osd_xinfo[o];
	  pending_inc.new_xinfo[o].old_weight = osdmap.osd_weight[o];

	  do_propose = true;

	  mon.clog->info() << "Marking osd." << o << " out (has been down for "
			   << int(down.sec()) << " seconds)";
	} else
	  // grace not yet expired: keep the timer running
	  continue;
      }

      // osd no longer down+in (or was just marked out): drop its timer
      down_pending_out.erase(o);
    }
  } else {
    dout(10) << "tick NOOUT flag set, not checking down osds" << dendl;
  }

  // expire blocklisted items?
  for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
       p != osdmap.blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring blocklist item " << p->first << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_blocklist.push_back(p->first);
      do_propose = true;
    }
  }
  for (auto p = osdmap.range_blocklist.begin();
       p != osdmap.range_blocklist.end();
       ++p) {
    if (p->second < now) {
      dout(10) << "expiring range_blocklist item " << p->first
	       << " expired " << p->second << " < now " << now << dendl;
      pending_inc.old_range_blocklist.push_back(p->first);
      do_propose = true;
    }
  }

  if (try_prune_purged_snaps()) {
    do_propose = true;
  }

  if (update_pools_status())
    do_propose = true;

  if (do_propose ||
      !pending_inc.new_pg_temp.empty())  // also propose if we adjusted pg_temp
    propose_pending();
}
5278
5279 void OSDMonitor::_set_new_cache_sizes()
5280 {
5281 uint64_t cache_size = 0;
5282 int64_t inc_alloc = 0;
5283 int64_t full_alloc = 0;
5284 int64_t kv_alloc = 0;
5285
5286 if (pcm != nullptr && rocksdb_binned_kv_cache != nullptr) {
5287 cache_size = pcm->get_tuned_mem();
5288 inc_alloc = inc_cache->get_committed_size();
5289 full_alloc = full_cache->get_committed_size();
5290 kv_alloc = rocksdb_binned_kv_cache->get_committed_size();
5291 }
5292
5293 inc_osd_cache.set_bytes(inc_alloc);
5294 full_osd_cache.set_bytes(full_alloc);
5295
5296 dout(1) << __func__ << " cache_size:" << cache_size
5297 << " inc_alloc: " << inc_alloc
5298 << " full_alloc: " << full_alloc
5299 << " kv_alloc: " << kv_alloc
5300 << dendl;
5301 }
5302
// Mark down any up OSD that has not sent a beacon within its timeout.
// last_osd_report maps osd id -> (time of last beacon, the osd's beacon
// interval).  Returns true if any OSD was newly marked down (caller
// should propose).
bool OSDMonitor::handle_osd_timeouts(const utime_t &now,
				     std::map<int, std::pair<utime_t, int>> &last_osd_report)
{
  utime_t timeo(g_conf()->mon_osd_report_timeout, 0);
  if (now - mon.get_leader_since() < timeo) {
    // We haven't been the leader for long enough to consider OSD timeouts
    return false;
  }

  int max_osd = osdmap.get_max_osd();
  bool new_down = false;

  for (int i=0; i < max_osd; ++i) {
    dout(30) << __func__ << ": checking up on osd " << i << dendl;
    if (!osdmap.exists(i)) {
      last_osd_report.erase(i); // if any
      continue;
    }
    // only up OSDs are expected to beacon
    if (!osdmap.is_up(i))
      continue;
    const std::map<int, std::pair<utime_t, int>>::const_iterator t = last_osd_report.find(i);
    if (t == last_osd_report.end()) {
      // it wasn't in the map; start the timer.
      last_osd_report[i].first = now;
      last_osd_report[i].second = 0;
    } else if (can_mark_down(i)) {
      utime_t diff = now - t->second.first;
      // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
      // to allow for the osd to miss a beacon.
      int mon_osd_report_timeout = g_conf()->mon_osd_report_timeout;
      utime_t max_timeout(std::max(mon_osd_report_timeout, 2 * t->second.second), 0);
      if (diff > max_timeout) {
	mon.clog->info() << "osd." << i << " marked down after no beacon for "
			 << diff << " seconds";
	derr << "no beacon from osd." << i << " since " << t->second.first
	     << ", " << diff << " seconds ago. marking down" << dendl;
	// NOTE(review): new_state appears to be applied as an xor mask, so
	// setting CEPH_OSD_UP here flips the up bit off (marks the osd
	// down) — confirm against OSDMap::Incremental semantics.
	pending_inc.new_state[i] = CEPH_OSD_UP;
	new_down = true;
      }
    }
  }
  return new_down;
}
5346
5347 static void dump_cpu_list(Formatter *f, const char *name,
5348 const string& strlist)
5349 {
5350 cpu_set_t cpu_set;
5351 size_t cpu_set_size;
5352 if (parse_cpu_set_list(strlist.c_str(), &cpu_set_size, &cpu_set) < 0) {
5353 return;
5354 }
5355 set<int> cpus = cpu_set_to_set(cpu_set_size, &cpu_set);
5356 f->open_array_section(name);
5357 for (auto cpu : cpus) {
5358 f->dump_int("cpu", cpu);
5359 }
5360 f->close_section();
5361 }
5362
// Dump the monitor's osdmap-related state for `mon status`-style output:
// the full osdmap, per-osd metadata, clean-epoch bookkeeping, committed
// version range, the crush map, and (when present) the osdmap manifest.
void OSDMonitor::dump_info(Formatter *f)
{
  f->open_object_section("osdmap");
  osdmap.dump(f);
  f->close_section();

  // one entry per existing osd; metadata errors are ignored here (NULL err)
  f->open_array_section("osd_metadata");
  for (int i=0; i<osdmap.get_max_osd(); ++i) {
    if (osdmap.exists(i)) {
      f->open_object_section("osd");
      f->dump_unsigned("id", i);
      dump_osd_metadata(i, f, NULL);
      f->close_section();
    }
  }
  f->close_section();

  f->open_object_section("osdmap_clean_epochs");
  f->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());

  f->open_object_section("last_epoch_clean");
  last_epoch_clean.dump(f);
  f->close_section();

  f->open_array_section("osd_epochs");
  for (auto& osd_epoch : osd_epochs) {
    f->open_object_section("osd");
    f->dump_unsigned("id", osd_epoch.first);
    f->dump_unsigned("epoch", osd_epoch.second);
    f->close_section();
  }
  f->close_section(); // osd_epochs

  f->close_section(); // osd_clean_epochs

  f->dump_unsigned("osdmap_first_committed", get_first_committed());
  f->dump_unsigned("osdmap_last_committed", get_last_committed());

  f->open_object_section("crushmap");
  osdmap.crush->dump(f);
  f->close_section();

  // the manifest only exists once pruning has produced one
  if (has_osdmap_manifest) {
    f->open_object_section("osdmap_manifest");
    osdmap_manifest.dump(f);
    f->close_section();
  }
}
5411
namespace {
  // All the per-pool attributes that "osd pool get" can report.
  enum osd_pool_get_choices {
    SIZE, MIN_SIZE,
    PG_NUM, PGP_NUM, CRUSH_RULE, HASHPSPOOL, EC_OVERWRITES,
    NODELETE, NOPGCHANGE, NOSIZECHANGE,
    WRITE_FADVISE_DONTNEED, NOSCRUB, NODEEP_SCRUB,
    HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
    USE_GMT_HITSET, TARGET_MAX_OBJECTS, TARGET_MAX_BYTES,
    CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
    CACHE_TARGET_FULL_RATIO,
    CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
    ERASURE_CODE_PROFILE, MIN_READ_RECENCY_FOR_PROMOTE,
    MIN_WRITE_RECENCY_FOR_PROMOTE, FAST_READ,
    HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N,
    SCRUB_MIN_INTERVAL, SCRUB_MAX_INTERVAL, DEEP_SCRUB_INTERVAL,
    RECOVERY_PRIORITY, RECOVERY_OP_PRIORITY, SCRUB_PRIORITY,
    COMPRESSION_MODE, COMPRESSION_ALGORITHM, COMPRESSION_REQUIRED_RATIO,
    COMPRESSION_MAX_BLOB_SIZE, COMPRESSION_MIN_BLOB_SIZE,
    CSUM_TYPE, CSUM_MAX_BLOCK, CSUM_MIN_BLOCK, FINGERPRINT_ALGORITHM,
    PG_AUTOSCALE_MODE, PG_NUM_MIN, TARGET_SIZE_BYTES, TARGET_SIZE_RATIO,
    PG_AUTOSCALE_BIAS, DEDUP_TIER, DEDUP_CHUNK_ALGORITHM,
    DEDUP_CDC_CHUNK_SIZE, POOL_EIO, BULK, PG_NUM_MAX };

  // Return the members of `first` that do not appear in `second`
  // (plain set difference over the ordered choice sets).
  std::set<osd_pool_get_choices>
  subtract_second_from_first(const std::set<osd_pool_get_choices>& first,
			     const std::set<osd_pool_get_choices>& second)
  {
    std::set<osd_pool_get_choices> result;
    for (auto choice : first) {
      if (second.count(choice) == 0) {
	result.insert(choice);
      }
    }
    return result;
  }
}
5446
5447
5448 bool OSDMonitor::preprocess_command(MonOpRequestRef op)
5449 {
5450 op->mark_osdmon_event(__func__);
5451 auto m = op->get_req<MMonCommand>();
5452 int r = 0;
5453 bufferlist rdata;
5454 stringstream ss, ds;
5455
5456 cmdmap_t cmdmap;
5457 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
5458 string rs = ss.str();
5459 mon.reply_command(op, -EINVAL, rs, get_last_committed());
5460 return true;
5461 }
5462
5463 MonSession *session = op->get_session();
5464 if (!session) {
5465 derr << __func__ << " no session" << dendl;
5466 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
5467 return true;
5468 }
5469
5470 string prefix;
5471 cmd_getval(cmdmap, "prefix", prefix);
5472
5473 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
5474 boost::scoped_ptr<Formatter> f(Formatter::create(format));
5475
5476 if (prefix == "osd stat") {
5477 if (f) {
5478 f->open_object_section("osdmap");
5479 osdmap.print_summary(f.get(), ds, "", true);
5480 f->close_section();
5481 f->flush(rdata);
5482 } else {
5483 osdmap.print_summary(nullptr, ds, "", true);
5484 rdata.append(ds);
5485 }
5486 }
5487 else if (prefix == "osd dump" ||
5488 prefix == "osd tree" ||
5489 prefix == "osd tree-from" ||
5490 prefix == "osd ls" ||
5491 prefix == "osd getmap" ||
5492 prefix == "osd getcrushmap" ||
5493 prefix == "osd ls-tree" ||
5494 prefix == "osd info") {
5495
5496 epoch_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", osdmap.get_epoch());
5497 bufferlist osdmap_bl;
5498 int err = get_version_full(epoch, osdmap_bl);
5499 if (err == -ENOENT) {
5500 r = -ENOENT;
5501 ss << "there is no map for epoch " << epoch;
5502 goto reply;
5503 }
5504 ceph_assert(err == 0);
5505 ceph_assert(osdmap_bl.length());
5506
5507 OSDMap *p;
5508 if (epoch == osdmap.get_epoch()) {
5509 p = &osdmap;
5510 } else {
5511 p = new OSDMap;
5512 p->decode(osdmap_bl);
5513 }
5514
5515 auto sg = make_scope_guard([&] {
5516 if (p != &osdmap) {
5517 delete p;
5518 }
5519 });
5520
5521 if (prefix == "osd dump") {
5522 stringstream ds;
5523 if (f) {
5524 f->open_object_section("osdmap");
5525 p->dump(f.get());
5526 f->close_section();
5527 f->flush(ds);
5528 } else {
5529 p->print(ds);
5530 }
5531 rdata.append(ds);
5532 if (!f)
5533 ds << " ";
5534 } else if (prefix == "osd ls") {
5535 if (f) {
5536 f->open_array_section("osds");
5537 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5538 if (osdmap.exists(i)) {
5539 f->dump_int("osd", i);
5540 }
5541 }
5542 f->close_section();
5543 f->flush(ds);
5544 } else {
5545 bool first = true;
5546 for (int i = 0; i < osdmap.get_max_osd(); i++) {
5547 if (osdmap.exists(i)) {
5548 if (!first)
5549 ds << "\n";
5550 first = false;
5551 ds << i;
5552 }
5553 }
5554 }
5555 rdata.append(ds);
5556 } else if (prefix == "osd info") {
5557 int64_t osd_id;
5558 bool do_single_osd = true;
5559 if (!cmd_getval(cmdmap, "id", osd_id)) {
5560 do_single_osd = false;
5561 }
5562
5563 if (do_single_osd && !osdmap.exists(osd_id)) {
5564 ss << "osd." << osd_id << " does not exist";
5565 r = -EINVAL;
5566 goto reply;
5567 }
5568
5569 if (f) {
5570 if (do_single_osd) {
5571 osdmap.dump_osd(osd_id, f.get());
5572 } else {
5573 osdmap.dump_osds(f.get());
5574 }
5575 f->flush(ds);
5576 } else {
5577 if (do_single_osd) {
5578 osdmap.print_osd(osd_id, ds);
5579 } else {
5580 osdmap.print_osds(ds);
5581 }
5582 }
5583 rdata.append(ds);
5584 } else if (prefix == "osd tree" || prefix == "osd tree-from") {
5585 string bucket;
5586 if (prefix == "osd tree-from") {
5587 cmd_getval(cmdmap, "bucket", bucket);
5588 if (!osdmap.crush->name_exists(bucket)) {
5589 ss << "bucket '" << bucket << "' does not exist";
5590 r = -ENOENT;
5591 goto reply;
5592 }
5593 int id = osdmap.crush->get_item_id(bucket);
5594 if (id >= 0) {
5595 ss << "\"" << bucket << "\" is not a bucket";
5596 r = -EINVAL;
5597 goto reply;
5598 }
5599 }
5600
5601 vector<string> states;
5602 cmd_getval(cmdmap, "states", states);
5603 unsigned filter = 0;
5604 for (auto& s : states) {
5605 if (s == "up") {
5606 filter |= OSDMap::DUMP_UP;
5607 } else if (s == "down") {
5608 filter |= OSDMap::DUMP_DOWN;
5609 } else if (s == "in") {
5610 filter |= OSDMap::DUMP_IN;
5611 } else if (s == "out") {
5612 filter |= OSDMap::DUMP_OUT;
5613 } else if (s == "destroyed") {
5614 filter |= OSDMap::DUMP_DESTROYED;
5615 } else {
5616 ss << "unrecognized state '" << s << "'";
5617 r = -EINVAL;
5618 goto reply;
5619 }
5620 }
5621 if ((filter & (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) ==
5622 (OSDMap::DUMP_IN|OSDMap::DUMP_OUT)) {
5623 ss << "cannot specify both 'in' and 'out'";
5624 r = -EINVAL;
5625 goto reply;
5626 }
5627 if (((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ==
5628 (OSDMap::DUMP_UP|OSDMap::DUMP_DOWN)) ||
5629 ((filter & (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ==
5630 (OSDMap::DUMP_UP|OSDMap::DUMP_DESTROYED)) ||
5631 ((filter & (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED)) ==
5632 (OSDMap::DUMP_DOWN|OSDMap::DUMP_DESTROYED))) {
5633 ss << "can specify only one of 'up', 'down' and 'destroyed'";
5634 r = -EINVAL;
5635 goto reply;
5636 }
5637 if (f) {
5638 f->open_object_section("tree");
5639 p->print_tree(f.get(), NULL, filter, bucket);
5640 f->close_section();
5641 f->flush(ds);
5642 } else {
5643 p->print_tree(NULL, &ds, filter, bucket);
5644 }
5645 rdata.append(ds);
5646 } else if (prefix == "osd getmap") {
5647 rdata.append(osdmap_bl);
5648 ss << "got osdmap epoch " << p->get_epoch();
5649 } else if (prefix == "osd getcrushmap") {
5650 p->crush->encode(rdata, mon.get_quorum_con_features());
5651 ss << p->get_crush_version();
5652 } else if (prefix == "osd ls-tree") {
5653 string bucket_name;
5654 cmd_getval(cmdmap, "name", bucket_name);
5655 set<int> osds;
5656 r = p->get_osds_by_bucket_name(bucket_name, &osds);
5657 if (r == -ENOENT) {
5658 ss << "\"" << bucket_name << "\" does not exist";
5659 goto reply;
5660 } else if (r < 0) {
5661 ss << "can not parse bucket name:\"" << bucket_name << "\"";
5662 goto reply;
5663 }
5664
5665 if (f) {
5666 f->open_array_section("osds");
5667 for (auto &i : osds) {
5668 if (osdmap.exists(i)) {
5669 f->dump_int("osd", i);
5670 }
5671 }
5672 f->close_section();
5673 f->flush(ds);
5674 } else {
5675 bool first = true;
5676 for (auto &i : osds) {
5677 if (osdmap.exists(i)) {
5678 if (!first)
5679 ds << "\n";
5680 first = false;
5681 ds << i;
5682 }
5683 }
5684 }
5685
5686 rdata.append(ds);
5687 }
5688 } else if (prefix == "osd getmaxosd") {
5689 if (f) {
5690 f->open_object_section("getmaxosd");
5691 f->dump_unsigned("epoch", osdmap.get_epoch());
5692 f->dump_int("max_osd", osdmap.get_max_osd());
5693 f->close_section();
5694 f->flush(rdata);
5695 } else {
5696 ds << "max_osd = " << osdmap.get_max_osd() << " in epoch " << osdmap.get_epoch();
5697 rdata.append(ds);
5698 }
5699 } else if (prefix == "osd utilization") {
5700 string out;
5701 osdmap.summarize_mapping_stats(NULL, NULL, &out, f.get());
5702 if (f)
5703 f->flush(rdata);
5704 else
5705 rdata.append(out);
5706 r = 0;
5707 goto reply;
5708 } else if (prefix == "osd find") {
5709 int64_t osd;
5710 if (!cmd_getval(cmdmap, "id", osd)) {
5711 ss << "unable to parse osd id value '"
5712 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5713 r = -EINVAL;
5714 goto reply;
5715 }
5716 if (!osdmap.exists(osd)) {
5717 ss << "osd." << osd << " does not exist";
5718 r = -ENOENT;
5719 goto reply;
5720 }
5721 string format;
5722 cmd_getval(cmdmap, "format", format);
5723 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5724 f->open_object_section("osd_location");
5725 f->dump_int("osd", osd);
5726 f->dump_object("addrs", osdmap.get_addrs(osd));
5727 f->dump_stream("osd_fsid") << osdmap.get_uuid(osd);
5728
5729 // try to identify host, pod/container name, etc.
5730 map<string,string> m;
5731 load_metadata(osd, m, nullptr);
5732 if (auto p = m.find("hostname"); p != m.end()) {
5733 f->dump_string("host", p->second);
5734 }
5735 for (auto& k : {
5736 "pod_name", "pod_namespace", // set by rook
5737 "container_name" // set by cephadm, ceph-ansible
5738 }) {
5739 if (auto p = m.find(k); p != m.end()) {
5740 f->dump_string(k, p->second);
5741 }
5742 }
5743
5744 // crush is helpful too
5745 f->open_object_section("crush_location");
5746 map<string,string> loc = osdmap.crush->get_full_location(osd);
5747 for (map<string,string>::iterator p = loc.begin(); p != loc.end(); ++p)
5748 f->dump_string(p->first.c_str(), p->second);
5749 f->close_section();
5750 f->close_section();
5751 f->flush(rdata);
5752 } else if (prefix == "osd metadata") {
5753 int64_t osd = -1;
5754 if (cmd_vartype_stringify(cmdmap["id"]).size() &&
5755 !cmd_getval(cmdmap, "id", osd)) {
5756 ss << "unable to parse osd id value '"
5757 << cmd_vartype_stringify(cmdmap["id"]) << "'";
5758 r = -EINVAL;
5759 goto reply;
5760 }
5761 if (osd >= 0 && !osdmap.exists(osd)) {
5762 ss << "osd." << osd << " does not exist";
5763 r = -ENOENT;
5764 goto reply;
5765 }
5766 string format;
5767 cmd_getval(cmdmap, "format", format);
5768 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
5769 if (osd >= 0) {
5770 f->open_object_section("osd_metadata");
5771 f->dump_unsigned("id", osd);
5772 r = dump_osd_metadata(osd, f.get(), &ss);
5773 if (r < 0)
5774 goto reply;
5775 f->close_section();
5776 } else {
5777 r = 0;
5778 f->open_array_section("osd_metadata");
5779 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5780 if (osdmap.exists(i)) {
5781 f->open_object_section("osd");
5782 f->dump_unsigned("id", i);
5783 r = dump_osd_metadata(i, f.get(), NULL);
5784 if (r == -EINVAL || r == -ENOENT) {
5785 // Drop error, continue to get other daemons' metadata
5786 dout(4) << "No metadata for osd." << i << dendl;
5787 r = 0;
5788 } else if (r < 0) {
5789 // Unexpected error
5790 goto reply;
5791 }
5792 f->close_section();
5793 }
5794 }
5795 f->close_section();
5796 }
5797 f->flush(rdata);
5798 } else if (prefix == "osd versions") {
5799 if (!f)
5800 f.reset(Formatter::create("json-pretty"));
5801 count_metadata("ceph_version", f.get());
5802 f->flush(rdata);
5803 r = 0;
5804 } else if (prefix == "osd count-metadata") {
5805 if (!f)
5806 f.reset(Formatter::create("json-pretty"));
5807 string field;
5808 cmd_getval(cmdmap, "property", field);
5809 count_metadata(field, f.get());
5810 f->flush(rdata);
5811 r = 0;
5812 } else if (prefix == "osd numa-status") {
5813 TextTable tbl;
5814 if (f) {
5815 f->open_array_section("osds");
5816 } else {
5817 tbl.define_column("OSD", TextTable::LEFT, TextTable::RIGHT);
5818 tbl.define_column("HOST", TextTable::LEFT, TextTable::LEFT);
5819 tbl.define_column("NETWORK", TextTable::RIGHT, TextTable::RIGHT);
5820 tbl.define_column("STORAGE", TextTable::RIGHT, TextTable::RIGHT);
5821 tbl.define_column("AFFINITY", TextTable::RIGHT, TextTable::RIGHT);
5822 tbl.define_column("CPUS", TextTable::LEFT, TextTable::LEFT);
5823 }
5824 for (int i=0; i<osdmap.get_max_osd(); ++i) {
5825 if (osdmap.exists(i)) {
5826 map<string,string> m;
5827 ostringstream err;
5828 if (load_metadata(i, m, &err) < 0) {
5829 continue;
5830 }
5831 string host;
5832 auto p = m.find("hostname");
5833 if (p != m.end()) {
5834 host = p->second;
5835 }
5836 if (f) {
5837 f->open_object_section("osd");
5838 f->dump_int("osd", i);
5839 f->dump_string("host", host);
5840 for (auto n : { "network_numa_node", "objectstore_numa_node",
5841 "numa_node" }) {
5842 p = m.find(n);
5843 if (p != m.end()) {
5844 f->dump_int(n, atoi(p->second.c_str()));
5845 }
5846 }
5847 for (auto n : { "network_numa_nodes", "objectstore_numa_nodes" }) {
5848 p = m.find(n);
5849 if (p != m.end()) {
5850 list<string> ls = get_str_list(p->second, ",");
5851 f->open_array_section(n);
5852 for (auto node : ls) {
5853 f->dump_int("node", atoi(node.c_str()));
5854 }
5855 f->close_section();
5856 }
5857 }
5858 for (auto n : { "numa_node_cpus" }) {
5859 p = m.find(n);
5860 if (p != m.end()) {
5861 dump_cpu_list(f.get(), n, p->second);
5862 }
5863 }
5864 f->close_section();
5865 } else {
5866 tbl << i;
5867 tbl << host;
5868 p = m.find("network_numa_nodes");
5869 if (p != m.end()) {
5870 tbl << p->second;
5871 } else {
5872 tbl << "-";
5873 }
5874 p = m.find("objectstore_numa_nodes");
5875 if (p != m.end()) {
5876 tbl << p->second;
5877 } else {
5878 tbl << "-";
5879 }
5880 p = m.find("numa_node");
5881 auto q = m.find("numa_node_cpus");
5882 if (p != m.end() && q != m.end()) {
5883 tbl << p->second;
5884 tbl << q->second;
5885 } else {
5886 tbl << "-";
5887 tbl << "-";
5888 }
5889 tbl << TextTable::endrow;
5890 }
5891 }
5892 }
5893 if (f) {
5894 f->close_section();
5895 f->flush(rdata);
5896 } else {
5897 rdata.append(stringify(tbl));
5898 }
5899 } else if (prefix == "osd map") {
5900 string poolstr, objstr, namespacestr;
5901 cmd_getval(cmdmap, "pool", poolstr);
5902 cmd_getval(cmdmap, "object", objstr);
5903 cmd_getval(cmdmap, "nspace", namespacestr);
5904
5905 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
5906 if (pool < 0) {
5907 ss << "pool " << poolstr << " does not exist";
5908 r = -ENOENT;
5909 goto reply;
5910 }
5911 object_locator_t oloc(pool, namespacestr);
5912 object_t oid(objstr);
5913 pg_t pgid = osdmap.object_locator_to_pg(oid, oloc);
5914 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5915 vector<int> up, acting;
5916 int up_p, acting_p;
5917 osdmap.pg_to_up_acting_osds(mpgid, &up, &up_p, &acting, &acting_p);
5918
5919 string fullobjname;
5920 if (!namespacestr.empty())
5921 fullobjname = namespacestr + string("/") + oid.name;
5922 else
5923 fullobjname = oid.name;
5924 if (f) {
5925 f->open_object_section("osd_map");
5926 f->dump_unsigned("epoch", osdmap.get_epoch());
5927 f->dump_string("pool", poolstr);
5928 f->dump_int("pool_id", pool);
5929 f->dump_stream("objname") << fullobjname;
5930 f->dump_stream("raw_pgid") << pgid;
5931 f->dump_stream("pgid") << mpgid;
5932 f->open_array_section("up");
5933 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
5934 f->dump_int("osd", *p);
5935 f->close_section();
5936 f->dump_int("up_primary", up_p);
5937 f->open_array_section("acting");
5938 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
5939 f->dump_int("osd", *p);
5940 f->close_section();
5941 f->dump_int("acting_primary", acting_p);
5942 f->close_section(); // osd_map
5943 f->flush(rdata);
5944 } else {
5945 ds << "osdmap e" << osdmap.get_epoch()
5946 << " pool '" << poolstr << "' (" << pool << ")"
5947 << " object '" << fullobjname << "' ->"
5948 << " pg " << pgid << " (" << mpgid << ")"
5949 << " -> up (" << pg_vector_string(up) << ", p" << up_p << ") acting ("
5950 << pg_vector_string(acting) << ", p" << acting_p << ")";
5951 rdata.append(ds);
5952 }
5953
5954 } else if (prefix == "pg map") {
5955 pg_t pgid;
5956 string pgidstr;
5957 cmd_getval(cmdmap, "pgid", pgidstr);
5958 if (!pgid.parse(pgidstr.c_str())) {
5959 ss << "invalid pgid '" << pgidstr << "'";
5960 r = -EINVAL;
5961 goto reply;
5962 }
5963 vector<int> up, acting;
5964 if (!osdmap.have_pg_pool(pgid.pool())) {
5965 ss << "pg '" << pgidstr << "' does not exist";
5966 r = -ENOENT;
5967 goto reply;
5968 }
5969 pg_t mpgid = osdmap.raw_pg_to_pg(pgid);
5970 osdmap.pg_to_up_acting_osds(pgid, up, acting);
5971 if (f) {
5972 f->open_object_section("pg_map");
5973 f->dump_unsigned("epoch", osdmap.get_epoch());
5974 f->dump_stream("raw_pgid") << pgid;
5975 f->dump_stream("pgid") << mpgid;
5976 f->open_array_section("up");
5977 for (auto osd : up) {
5978 f->dump_int("up_osd", osd);
5979 }
5980 f->close_section();
5981 f->open_array_section("acting");
5982 for (auto osd : acting) {
5983 f->dump_int("acting_osd", osd);
5984 }
5985 f->close_section();
5986 f->close_section();
5987 f->flush(rdata);
5988 } else {
5989 ds << "osdmap e" << osdmap.get_epoch()
5990 << " pg " << pgid << " (" << mpgid << ")"
5991 << " -> up " << up << " acting " << acting;
5992 rdata.append(ds);
5993 }
5994 goto reply;
5995
5996 } else if (prefix == "osd lspools") {
5997 if (f)
5998 f->open_array_section("pools");
5999 for (map<int64_t, pg_pool_t>::iterator p = osdmap.pools.begin();
6000 p != osdmap.pools.end();
6001 ++p) {
6002 if (f) {
6003 f->open_object_section("pool");
6004 f->dump_int("poolnum", p->first);
6005 f->dump_string("poolname", osdmap.pool_name[p->first]);
6006 f->close_section();
6007 } else {
6008 ds << p->first << ' ' << osdmap.pool_name[p->first];
6009 if (next(p) != osdmap.pools.end()) {
6010 ds << '\n';
6011 }
6012 }
6013 }
6014 if (f) {
6015 f->close_section();
6016 f->flush(ds);
6017 }
6018 rdata.append(ds);
6019 } else if (prefix == "osd blocklist ls" ||
6020 prefix == "osd blacklist ls") {
6021 if (f)
6022 f->open_array_section("blocklist");
6023
6024 for (ceph::unordered_map<entity_addr_t,utime_t>::iterator p = osdmap.blocklist.begin();
6025 p != osdmap.blocklist.end();
6026 ++p) {
6027 if (f) {
6028 f->open_object_section("entry");
6029 f->dump_string("addr", p->first.get_legacy_str());
6030 f->dump_stream("until") << p->second;
6031 f->close_section();
6032 } else {
6033 stringstream ss;
6034 string s;
6035 ss << p->first << " " << p->second;
6036 getline(ss, s);
6037 s += "\n";
6038 rdata.append(s);
6039 }
6040 }
6041 if (f) {
6042 f->close_section();
6043 f->flush(rdata);
6044 }
6045 if (f)
6046 f->open_array_section("range_blocklist");
6047
6048 for (auto p = osdmap.range_blocklist.begin();
6049 p != osdmap.range_blocklist.end();
6050 ++p) {
6051 if (f) {
6052 f->open_object_section("entry");
6053 f->dump_string("range", p->first.get_legacy_str());
6054 f->dump_stream("until") << p->second;
6055 f->close_section();
6056 } else {
6057 stringstream ss;
6058 string s;
6059 ss << p->first << " " << p->second;
6060 getline(ss, s);
6061 s += "\n";
6062 rdata.append(s);
6063 }
6064 }
6065 if (f) {
6066 f->close_section();
6067 f->flush(rdata);
6068 }
6069 ss << "listed " << osdmap.blocklist.size() + osdmap.range_blocklist.size() << " entries";
6070
6071 } else if (prefix == "osd pool ls") {
6072 string detail;
6073 cmd_getval(cmdmap, "detail", detail);
6074 if (!f && detail == "detail") {
6075 ostringstream ss;
6076 osdmap.print_pools(ss);
6077 rdata.append(ss.str());
6078 } else {
6079 if (f)
6080 f->open_array_section("pools");
6081 for (map<int64_t,pg_pool_t>::const_iterator it = osdmap.get_pools().begin();
6082 it != osdmap.get_pools().end();
6083 ++it) {
6084 if (f) {
6085 if (detail == "detail") {
6086 f->open_object_section("pool");
6087 f->dump_int("pool_id", it->first);
6088 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6089 it->second.dump(f.get());
6090 f->close_section();
6091 } else {
6092 f->dump_string("pool_name", osdmap.get_pool_name(it->first));
6093 }
6094 } else {
6095 rdata.append(osdmap.get_pool_name(it->first) + "\n");
6096 }
6097 }
6098 if (f) {
6099 f->close_section();
6100 f->flush(rdata);
6101 }
6102 }
6103
6104 } else if (prefix == "osd crush get-tunable") {
6105 string tunable;
6106 cmd_getval(cmdmap, "tunable", tunable);
6107 ostringstream rss;
6108 if (f)
6109 f->open_object_section("tunable");
6110 if (tunable == "straw_calc_version") {
6111 if (f)
6112 f->dump_int(tunable.c_str(), osdmap.crush->get_straw_calc_version());
6113 else
6114 rss << osdmap.crush->get_straw_calc_version() << "\n";
6115 } else {
6116 r = -EINVAL;
6117 goto reply;
6118 }
6119 if (f) {
6120 f->close_section();
6121 f->flush(rdata);
6122 } else {
6123 rdata.append(rss.str());
6124 }
6125 r = 0;
6126
6127 } else if (prefix == "osd pool get") {
6128 string poolstr;
6129 cmd_getval(cmdmap, "pool", poolstr);
6130 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
6131 if (pool < 0) {
6132 ss << "unrecognized pool '" << poolstr << "'";
6133 r = -ENOENT;
6134 goto reply;
6135 }
6136
6137 const pg_pool_t *p = osdmap.get_pg_pool(pool);
6138 string var;
6139 cmd_getval(cmdmap, "var", var);
6140
6141 typedef std::map<std::string, osd_pool_get_choices> choices_map_t;
6142 const choices_map_t ALL_CHOICES = {
6143 {"size", SIZE},
6144 {"min_size", MIN_SIZE},
6145 {"pg_num", PG_NUM}, {"pgp_num", PGP_NUM},
6146 {"crush_rule", CRUSH_RULE},
6147 {"hashpspool", HASHPSPOOL},
6148 {"eio", POOL_EIO},
6149 {"allow_ec_overwrites", EC_OVERWRITES}, {"nodelete", NODELETE},
6150 {"nopgchange", NOPGCHANGE}, {"nosizechange", NOSIZECHANGE},
6151 {"noscrub", NOSCRUB}, {"nodeep-scrub", NODEEP_SCRUB},
6152 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED},
6153 {"hit_set_type", HIT_SET_TYPE}, {"hit_set_period", HIT_SET_PERIOD},
6154 {"hit_set_count", HIT_SET_COUNT}, {"hit_set_fpp", HIT_SET_FPP},
6155 {"use_gmt_hitset", USE_GMT_HITSET},
6156 {"target_max_objects", TARGET_MAX_OBJECTS},
6157 {"target_max_bytes", TARGET_MAX_BYTES},
6158 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO},
6159 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO},
6160 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO},
6161 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE},
6162 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE},
6163 {"erasure_code_profile", ERASURE_CODE_PROFILE},
6164 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE},
6165 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE},
6166 {"fast_read", FAST_READ},
6167 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE},
6168 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N},
6169 {"scrub_min_interval", SCRUB_MIN_INTERVAL},
6170 {"scrub_max_interval", SCRUB_MAX_INTERVAL},
6171 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL},
6172 {"recovery_priority", RECOVERY_PRIORITY},
6173 {"recovery_op_priority", RECOVERY_OP_PRIORITY},
6174 {"scrub_priority", SCRUB_PRIORITY},
6175 {"compression_mode", COMPRESSION_MODE},
6176 {"compression_algorithm", COMPRESSION_ALGORITHM},
6177 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO},
6178 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE},
6179 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE},
6180 {"csum_type", CSUM_TYPE},
6181 {"csum_max_block", CSUM_MAX_BLOCK},
6182 {"csum_min_block", CSUM_MIN_BLOCK},
6183 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM},
6184 {"pg_autoscale_mode", PG_AUTOSCALE_MODE},
6185 {"pg_num_min", PG_NUM_MIN},
6186 {"pg_num_max", PG_NUM_MAX},
6187 {"target_size_bytes", TARGET_SIZE_BYTES},
6188 {"target_size_ratio", TARGET_SIZE_RATIO},
6189 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS},
6190 {"dedup_tier", DEDUP_TIER},
6191 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM},
6192 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE},
6193 {"bulk", BULK}
6194 };
6195
6196 typedef std::set<osd_pool_get_choices> choices_set_t;
6197
6198 const choices_set_t ONLY_TIER_CHOICES = {
6199 HIT_SET_TYPE, HIT_SET_PERIOD, HIT_SET_COUNT, HIT_SET_FPP,
6200 TARGET_MAX_OBJECTS, TARGET_MAX_BYTES, CACHE_TARGET_FULL_RATIO,
6201 CACHE_TARGET_DIRTY_RATIO, CACHE_TARGET_DIRTY_HIGH_RATIO,
6202 CACHE_MIN_FLUSH_AGE, CACHE_MIN_EVICT_AGE,
6203 MIN_READ_RECENCY_FOR_PROMOTE,
6204 MIN_WRITE_RECENCY_FOR_PROMOTE,
6205 HIT_SET_GRADE_DECAY_RATE, HIT_SET_SEARCH_LAST_N
6206 };
6207 const choices_set_t ONLY_ERASURE_CHOICES = {
6208 EC_OVERWRITES, ERASURE_CODE_PROFILE
6209 };
6210
6211 choices_set_t selected_choices;
6212 if (var == "all") {
6213 for(choices_map_t::const_iterator it = ALL_CHOICES.begin();
6214 it != ALL_CHOICES.end(); ++it) {
6215 selected_choices.insert(it->second);
6216 }
6217
6218 if(!p->is_tier()) {
6219 selected_choices = subtract_second_from_first(selected_choices,
6220 ONLY_TIER_CHOICES);
6221 }
6222
6223 if(!p->is_erasure()) {
6224 selected_choices = subtract_second_from_first(selected_choices,
6225 ONLY_ERASURE_CHOICES);
6226 }
6227 } else /* var != "all" */ {
6228 choices_map_t::const_iterator found = ALL_CHOICES.find(var);
6229 if (found == ALL_CHOICES.end()) {
6230 ss << "pool '" << poolstr
6231 << "': invalid variable: '" << var << "'";
6232 r = -EINVAL;
6233 goto reply;
6234 }
6235
6236 osd_pool_get_choices selected = found->second;
6237
6238 if (!p->is_tier() &&
6239 ONLY_TIER_CHOICES.find(selected) != ONLY_TIER_CHOICES.end()) {
6240 ss << "pool '" << poolstr
6241 << "' is not a tier pool: variable not applicable";
6242 r = -EACCES;
6243 goto reply;
6244 }
6245
6246 if (!p->is_erasure() &&
6247 ONLY_ERASURE_CHOICES.find(selected)
6248 != ONLY_ERASURE_CHOICES.end()) {
6249 ss << "pool '" << poolstr
6250 << "' is not a erasure pool: variable not applicable";
6251 r = -EACCES;
6252 goto reply;
6253 }
6254
6255 if (pool_opts_t::is_opt_name(var) &&
6256 !p->opts.is_set(pool_opts_t::get_opt_desc(var).key)) {
6257 ss << "option '" << var << "' is not set on pool '" << poolstr << "'";
6258 r = -ENOENT;
6259 goto reply;
6260 }
6261
6262 selected_choices.insert(selected);
6263 }
6264
6265 if (f) {
6266 f->open_object_section("pool");
6267 f->dump_string("pool", poolstr);
6268 f->dump_int("pool_id", pool);
6269 for(choices_set_t::const_iterator it = selected_choices.begin();
6270 it != selected_choices.end(); ++it) {
6271 choices_map_t::const_iterator i;
6272 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6273 if (i->second == *it) {
6274 break;
6275 }
6276 }
6277 ceph_assert(i != ALL_CHOICES.end());
6278 switch(*it) {
6279 case PG_NUM:
6280 f->dump_int("pg_num", p->get_pg_num());
6281 break;
6282 case PGP_NUM:
6283 f->dump_int("pgp_num", p->get_pgp_num());
6284 break;
6285 case SIZE:
6286 f->dump_int("size", p->get_size());
6287 break;
6288 case MIN_SIZE:
6289 f->dump_int("min_size", p->get_min_size());
6290 break;
6291 case CRUSH_RULE:
6292 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6293 f->dump_string("crush_rule", osdmap.crush->get_rule_name(
6294 p->get_crush_rule()));
6295 } else {
6296 f->dump_string("crush_rule", stringify(p->get_crush_rule()));
6297 }
6298 break;
6299 case EC_OVERWRITES:
6300 f->dump_bool("allow_ec_overwrites",
6301 p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES));
6302 break;
6303 case PG_AUTOSCALE_MODE:
6304 f->dump_string("pg_autoscale_mode",
6305 pg_pool_t::get_pg_autoscale_mode_name(
6306 p->pg_autoscale_mode));
6307 break;
6308 case HASHPSPOOL:
6309 case POOL_EIO:
6310 case NODELETE:
6311 case BULK:
6312 case NOPGCHANGE:
6313 case NOSIZECHANGE:
6314 case WRITE_FADVISE_DONTNEED:
6315 case NOSCRUB:
6316 case NODEEP_SCRUB:
6317 f->dump_bool(i->first.c_str(),
6318 p->has_flag(pg_pool_t::get_flag_by_name(i->first)));
6319 break;
6320 case HIT_SET_PERIOD:
6321 f->dump_int("hit_set_period", p->hit_set_period);
6322 break;
6323 case HIT_SET_COUNT:
6324 f->dump_int("hit_set_count", p->hit_set_count);
6325 break;
6326 case HIT_SET_TYPE:
6327 f->dump_string("hit_set_type",
6328 HitSet::get_type_name(p->hit_set_params.get_type()));
6329 break;
6330 case HIT_SET_FPP:
6331 {
6332 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6333 BloomHitSet::Params *bloomp =
6334 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6335 f->dump_float("hit_set_fpp", bloomp->get_fpp());
6336 } else if(var != "all") {
6337 f->close_section();
6338 ss << "hit set is not of type Bloom; " <<
6339 "invalid to get a false positive rate!";
6340 r = -EINVAL;
6341 goto reply;
6342 }
6343 }
6344 break;
6345 case USE_GMT_HITSET:
6346 f->dump_bool("use_gmt_hitset", p->use_gmt_hitset);
6347 break;
6348 case TARGET_MAX_OBJECTS:
6349 f->dump_unsigned("target_max_objects", p->target_max_objects);
6350 break;
6351 case TARGET_MAX_BYTES:
6352 f->dump_unsigned("target_max_bytes", p->target_max_bytes);
6353 break;
6354 case CACHE_TARGET_DIRTY_RATIO:
6355 f->dump_unsigned("cache_target_dirty_ratio_micro",
6356 p->cache_target_dirty_ratio_micro);
6357 f->dump_float("cache_target_dirty_ratio",
6358 ((float)p->cache_target_dirty_ratio_micro/1000000));
6359 break;
6360 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6361 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
6362 p->cache_target_dirty_high_ratio_micro);
6363 f->dump_float("cache_target_dirty_high_ratio",
6364 ((float)p->cache_target_dirty_high_ratio_micro/1000000));
6365 break;
6366 case CACHE_TARGET_FULL_RATIO:
6367 f->dump_unsigned("cache_target_full_ratio_micro",
6368 p->cache_target_full_ratio_micro);
6369 f->dump_float("cache_target_full_ratio",
6370 ((float)p->cache_target_full_ratio_micro/1000000));
6371 break;
6372 case CACHE_MIN_FLUSH_AGE:
6373 f->dump_unsigned("cache_min_flush_age", p->cache_min_flush_age);
6374 break;
6375 case CACHE_MIN_EVICT_AGE:
6376 f->dump_unsigned("cache_min_evict_age", p->cache_min_evict_age);
6377 break;
6378 case ERASURE_CODE_PROFILE:
6379 f->dump_string("erasure_code_profile", p->erasure_code_profile);
6380 break;
6381 case MIN_READ_RECENCY_FOR_PROMOTE:
6382 f->dump_int("min_read_recency_for_promote",
6383 p->min_read_recency_for_promote);
6384 break;
6385 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6386 f->dump_int("min_write_recency_for_promote",
6387 p->min_write_recency_for_promote);
6388 break;
6389 case FAST_READ:
6390 f->dump_int("fast_read", p->fast_read);
6391 break;
6392 case HIT_SET_GRADE_DECAY_RATE:
6393 f->dump_int("hit_set_grade_decay_rate",
6394 p->hit_set_grade_decay_rate);
6395 break;
6396 case HIT_SET_SEARCH_LAST_N:
6397 f->dump_int("hit_set_search_last_n",
6398 p->hit_set_search_last_n);
6399 break;
6400 case SCRUB_MIN_INTERVAL:
6401 case SCRUB_MAX_INTERVAL:
6402 case DEEP_SCRUB_INTERVAL:
6403 case RECOVERY_PRIORITY:
6404 case RECOVERY_OP_PRIORITY:
6405 case SCRUB_PRIORITY:
6406 case COMPRESSION_MODE:
6407 case COMPRESSION_ALGORITHM:
6408 case COMPRESSION_REQUIRED_RATIO:
6409 case COMPRESSION_MAX_BLOB_SIZE:
6410 case COMPRESSION_MIN_BLOB_SIZE:
6411 case CSUM_TYPE:
6412 case CSUM_MAX_BLOCK:
6413 case CSUM_MIN_BLOCK:
6414 case FINGERPRINT_ALGORITHM:
6415 case PG_NUM_MIN:
6416 case PG_NUM_MAX:
6417 case TARGET_SIZE_BYTES:
6418 case TARGET_SIZE_RATIO:
6419 case PG_AUTOSCALE_BIAS:
6420 case DEDUP_TIER:
6421 case DEDUP_CHUNK_ALGORITHM:
6422 case DEDUP_CDC_CHUNK_SIZE:
6423 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6424 if (p->opts.is_set(key)) {
6425 if(*it == CSUM_TYPE) {
6426 int64_t val;
6427 p->opts.get(pool_opts_t::CSUM_TYPE, &val);
6428 f->dump_string(i->first.c_str(), Checksummer::get_csum_type_string(val));
6429 } else {
6430 p->opts.dump(i->first, f.get());
6431 }
6432 }
6433 break;
6434 }
6435 }
6436 f->close_section();
6437 f->flush(rdata);
6438 } else /* !f */ {
6439 for(choices_set_t::const_iterator it = selected_choices.begin();
6440 it != selected_choices.end(); ++it) {
6441 choices_map_t::const_iterator i;
6442 switch(*it) {
6443 case PG_NUM:
6444 ss << "pg_num: " << p->get_pg_num() << "\n";
6445 break;
6446 case PGP_NUM:
6447 ss << "pgp_num: " << p->get_pgp_num() << "\n";
6448 break;
6449 case SIZE:
6450 ss << "size: " << p->get_size() << "\n";
6451 break;
6452 case MIN_SIZE:
6453 ss << "min_size: " << p->get_min_size() << "\n";
6454 break;
6455 case CRUSH_RULE:
6456 if (osdmap.crush->rule_exists(p->get_crush_rule())) {
6457 ss << "crush_rule: " << osdmap.crush->get_rule_name(
6458 p->get_crush_rule()) << "\n";
6459 } else {
6460 ss << "crush_rule: " << p->get_crush_rule() << "\n";
6461 }
6462 break;
6463 case PG_AUTOSCALE_MODE:
6464 ss << "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6465 p->pg_autoscale_mode) <<"\n";
6466 break;
6467 case HIT_SET_PERIOD:
6468 ss << "hit_set_period: " << p->hit_set_period << "\n";
6469 break;
6470 case HIT_SET_COUNT:
6471 ss << "hit_set_count: " << p->hit_set_count << "\n";
6472 break;
6473 case HIT_SET_TYPE:
6474 ss << "hit_set_type: " <<
6475 HitSet::get_type_name(p->hit_set_params.get_type()) << "\n";
6476 break;
6477 case HIT_SET_FPP:
6478 {
6479 if (p->hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
6480 BloomHitSet::Params *bloomp =
6481 static_cast<BloomHitSet::Params*>(p->hit_set_params.impl.get());
6482 ss << "hit_set_fpp: " << bloomp->get_fpp() << "\n";
6483 } else if(var != "all") {
6484 ss << "hit set is not of type Bloom; " <<
6485 "invalid to get a false positive rate!";
6486 r = -EINVAL;
6487 goto reply;
6488 }
6489 }
6490 break;
6491 case USE_GMT_HITSET:
6492 ss << "use_gmt_hitset: " << p->use_gmt_hitset << "\n";
6493 break;
6494 case TARGET_MAX_OBJECTS:
6495 ss << "target_max_objects: " << p->target_max_objects << "\n";
6496 break;
6497 case TARGET_MAX_BYTES:
6498 ss << "target_max_bytes: " << p->target_max_bytes << "\n";
6499 break;
6500 case CACHE_TARGET_DIRTY_RATIO:
6501 ss << "cache_target_dirty_ratio: "
6502 << ((float)p->cache_target_dirty_ratio_micro/1000000) << "\n";
6503 break;
6504 case CACHE_TARGET_DIRTY_HIGH_RATIO:
6505 ss << "cache_target_dirty_high_ratio: "
6506 << ((float)p->cache_target_dirty_high_ratio_micro/1000000) << "\n";
6507 break;
6508 case CACHE_TARGET_FULL_RATIO:
6509 ss << "cache_target_full_ratio: "
6510 << ((float)p->cache_target_full_ratio_micro/1000000) << "\n";
6511 break;
6512 case CACHE_MIN_FLUSH_AGE:
6513 ss << "cache_min_flush_age: " << p->cache_min_flush_age << "\n";
6514 break;
6515 case CACHE_MIN_EVICT_AGE:
6516 ss << "cache_min_evict_age: " << p->cache_min_evict_age << "\n";
6517 break;
6518 case ERASURE_CODE_PROFILE:
6519 ss << "erasure_code_profile: " << p->erasure_code_profile << "\n";
6520 break;
6521 case MIN_READ_RECENCY_FOR_PROMOTE:
6522 ss << "min_read_recency_for_promote: " <<
6523 p->min_read_recency_for_promote << "\n";
6524 break;
6525 case HIT_SET_GRADE_DECAY_RATE:
6526 ss << "hit_set_grade_decay_rate: " <<
6527 p->hit_set_grade_decay_rate << "\n";
6528 break;
6529 case HIT_SET_SEARCH_LAST_N:
6530 ss << "hit_set_search_last_n: " <<
6531 p->hit_set_search_last_n << "\n";
6532 break;
6533 case EC_OVERWRITES:
6534 ss << "allow_ec_overwrites: " <<
6535 (p->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) ? "true" : "false") <<
6536 "\n";
6537 break;
6538 case HASHPSPOOL:
6539 case POOL_EIO:
6540 case NODELETE:
6541 case BULK:
6542 case NOPGCHANGE:
6543 case NOSIZECHANGE:
6544 case WRITE_FADVISE_DONTNEED:
6545 case NOSCRUB:
6546 case NODEEP_SCRUB:
6547 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6548 if (i->second == *it)
6549 break;
6550 }
6551 ceph_assert(i != ALL_CHOICES.end());
6552 ss << i->first << ": " <<
6553 (p->has_flag(pg_pool_t::get_flag_by_name(i->first)) ?
6554 "true" : "false") << "\n";
6555 break;
6556 case MIN_WRITE_RECENCY_FOR_PROMOTE:
6557 ss << "min_write_recency_for_promote: " <<
6558 p->min_write_recency_for_promote << "\n";
6559 break;
6560 case FAST_READ:
6561 ss << "fast_read: " << p->fast_read << "\n";
6562 break;
6563 case SCRUB_MIN_INTERVAL:
6564 case SCRUB_MAX_INTERVAL:
6565 case DEEP_SCRUB_INTERVAL:
6566 case RECOVERY_PRIORITY:
6567 case RECOVERY_OP_PRIORITY:
6568 case SCRUB_PRIORITY:
6569 case COMPRESSION_MODE:
6570 case COMPRESSION_ALGORITHM:
6571 case COMPRESSION_REQUIRED_RATIO:
6572 case COMPRESSION_MAX_BLOB_SIZE:
6573 case COMPRESSION_MIN_BLOB_SIZE:
6574 case CSUM_TYPE:
6575 case CSUM_MAX_BLOCK:
6576 case CSUM_MIN_BLOCK:
6577 case FINGERPRINT_ALGORITHM:
6578 case PG_NUM_MIN:
6579 case PG_NUM_MAX:
6580 case TARGET_SIZE_BYTES:
6581 case TARGET_SIZE_RATIO:
6582 case PG_AUTOSCALE_BIAS:
6583 case DEDUP_TIER:
6584 case DEDUP_CHUNK_ALGORITHM:
6585 case DEDUP_CDC_CHUNK_SIZE:
6586 for (i = ALL_CHOICES.begin(); i != ALL_CHOICES.end(); ++i) {
6587 if (i->second == *it)
6588 break;
6589 }
6590 ceph_assert(i != ALL_CHOICES.end());
6591 {
6592 pool_opts_t::key_t key = pool_opts_t::get_opt_desc(i->first).key;
6593 if (p->opts.is_set(key)) {
6594 if(key == pool_opts_t::CSUM_TYPE) {
6595 int64_t val;
6596 p->opts.get(key, &val);
6597 ss << i->first << ": " << Checksummer::get_csum_type_string(val) << "\n";
6598 } else {
6599 ss << i->first << ": " << p->opts.get(key) << "\n";
6600 }
6601 }
6602 }
6603 break;
6604 }
6605 rdata.append(ss.str());
6606 ss.str("");
6607 }
6608 }
6609 r = 0;
6610 } else if (prefix == "osd pool get-quota") {
6611 string pool_name;
6612 cmd_getval(cmdmap, "pool", pool_name);
6613
6614 int64_t poolid = osdmap.lookup_pg_pool_name(pool_name);
6615 if (poolid < 0) {
6616 ceph_assert(poolid == -ENOENT);
6617 ss << "unrecognized pool '" << pool_name << "'";
6618 r = -ENOENT;
6619 goto reply;
6620 }
6621 const pg_pool_t *p = osdmap.get_pg_pool(poolid);
6622 const pool_stat_t* pstat = mon.mgrstatmon()->get_pool_stat(poolid);
6623 if (!pstat) {
6624 ss << "no stats for pool '" << pool_name << "'";
6625 r = -ENOENT;
6626 goto reply;
6627 }
6628 const object_stat_sum_t& sum = pstat->stats.sum;
6629 if (f) {
6630 f->open_object_section("pool_quotas");
6631 f->dump_string("pool_name", pool_name);
6632 f->dump_unsigned("pool_id", poolid);
6633 f->dump_unsigned("quota_max_objects", p->quota_max_objects);
6634 f->dump_int("current_num_objects", sum.num_objects);
6635 f->dump_unsigned("quota_max_bytes", p->quota_max_bytes);
6636 f->dump_int("current_num_bytes", sum.num_bytes);
6637 f->close_section();
6638 f->flush(rdata);
6639 } else {
6640 stringstream rs;
6641 rs << "quotas for pool '" << pool_name << "':\n"
6642 << " max objects: ";
6643 if (p->quota_max_objects == 0)
6644 rs << "N/A";
6645 else {
6646 rs << si_u_t(p->quota_max_objects) << " objects";
6647 rs << " (current num objects: " << sum.num_objects << " objects)";
6648 }
6649 rs << "\n"
6650 << " max bytes : ";
6651 if (p->quota_max_bytes == 0)
6652 rs << "N/A";
6653 else {
6654 rs << byte_u_t(p->quota_max_bytes);
6655 rs << " (current num bytes: " << sum.num_bytes << " bytes)";
6656 }
6657 rdata.append(rs.str());
6658 }
6659 rdata.append("\n");
6660 r = 0;
6661 } else if (prefix == "osd crush rule list" ||
6662 prefix == "osd crush rule ls") {
6663 if (f) {
6664 f->open_array_section("rules");
6665 osdmap.crush->list_rules(f.get());
6666 f->close_section();
6667 f->flush(rdata);
6668 } else {
6669 ostringstream ss;
6670 osdmap.crush->list_rules(&ss);
6671 rdata.append(ss.str());
6672 }
6673 } else if (prefix == "osd crush rule ls-by-class") {
6674 string class_name;
6675 cmd_getval(cmdmap, "class", class_name);
6676 if (class_name.empty()) {
6677 ss << "no class specified";
6678 r = -EINVAL;
6679 goto reply;
6680 }
6681 set<int> rules;
6682 r = osdmap.crush->get_rules_by_class(class_name, &rules);
6683 if (r < 0) {
6684 ss << "failed to get rules by class '" << class_name << "'";
6685 goto reply;
6686 }
6687 if (f) {
6688 f->open_array_section("rules");
6689 for (auto &rule: rules) {
6690 f->dump_string("name", osdmap.crush->get_rule_name(rule));
6691 }
6692 f->close_section();
6693 f->flush(rdata);
6694 } else {
6695 ostringstream rs;
6696 for (auto &rule: rules) {
6697 rs << osdmap.crush->get_rule_name(rule) << "\n";
6698 }
6699 rdata.append(rs.str());
6700 }
6701 } else if (prefix == "osd crush rule dump") {
6702 string name;
6703 cmd_getval(cmdmap, "name", name);
6704 string format;
6705 cmd_getval(cmdmap, "format", format);
6706 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6707 if (name == "") {
6708 f->open_array_section("rules");
6709 osdmap.crush->dump_rules(f.get());
6710 f->close_section();
6711 } else {
6712 int ruleno = osdmap.crush->get_rule_id(name);
6713 if (ruleno < 0) {
6714 ss << "unknown crush rule '" << name << "'";
6715 r = ruleno;
6716 goto reply;
6717 }
6718 osdmap.crush->dump_rule(ruleno, f.get());
6719 }
6720 ostringstream rs;
6721 f->flush(rs);
6722 rs << "\n";
6723 rdata.append(rs.str());
6724 } else if (prefix == "osd crush dump") {
6725 string format;
6726 cmd_getval(cmdmap, "format", format);
6727 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6728 f->open_object_section("crush_map");
6729 osdmap.crush->dump(f.get());
6730 f->close_section();
6731 ostringstream rs;
6732 f->flush(rs);
6733 rs << "\n";
6734 rdata.append(rs.str());
6735 } else if (prefix == "osd crush show-tunables") {
6736 string format;
6737 cmd_getval(cmdmap, "format", format);
6738 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6739 f->open_object_section("crush_map_tunables");
6740 osdmap.crush->dump_tunables(f.get());
6741 f->close_section();
6742 ostringstream rs;
6743 f->flush(rs);
6744 rs << "\n";
6745 rdata.append(rs.str());
6746 } else if (prefix == "osd crush tree") {
6747 bool show_shadow = false;
6748 if (!cmd_getval_compat_cephbool(cmdmap, "show_shadow", show_shadow)) {
6749 std::string shadow;
6750 if (cmd_getval(cmdmap, "shadow", shadow) &&
6751 shadow == "--show-shadow") {
6752 show_shadow = true;
6753 }
6754 }
6755 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6756 if (f) {
6757 f->open_object_section("crush_tree");
6758 osdmap.crush->dump_tree(nullptr,
6759 f.get(),
6760 osdmap.get_pool_names(),
6761 show_shadow);
6762 f->close_section();
6763 f->flush(rdata);
6764 } else {
6765 ostringstream ss;
6766 osdmap.crush->dump_tree(&ss,
6767 nullptr,
6768 osdmap.get_pool_names(),
6769 show_shadow);
6770 rdata.append(ss.str());
6771 }
6772 } else if (prefix == "osd crush ls") {
6773 string name;
6774 if (!cmd_getval(cmdmap, "node", name)) {
6775 ss << "no node specified";
6776 r = -EINVAL;
6777 goto reply;
6778 }
6779 if (!osdmap.crush->name_exists(name)) {
6780 ss << "node '" << name << "' does not exist";
6781 r = -ENOENT;
6782 goto reply;
6783 }
6784 int id = osdmap.crush->get_item_id(name);
6785 list<int> result;
6786 if (id >= 0) {
6787 result.push_back(id);
6788 } else {
6789 int num = osdmap.crush->get_bucket_size(id);
6790 for (int i = 0; i < num; ++i) {
6791 result.push_back(osdmap.crush->get_bucket_item(id, i));
6792 }
6793 }
6794 if (f) {
6795 f->open_array_section("items");
6796 for (auto i : result) {
6797 f->dump_string("item", osdmap.crush->get_item_name(i));
6798 }
6799 f->close_section();
6800 f->flush(rdata);
6801 } else {
6802 ostringstream ss;
6803 for (auto i : result) {
6804 ss << osdmap.crush->get_item_name(i) << "\n";
6805 }
6806 rdata.append(ss.str());
6807 }
6808 r = 0;
6809 } else if (prefix == "osd crush class ls") {
6810 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
6811 f->open_array_section("crush_classes");
6812 for (auto i : osdmap.crush->class_name)
6813 f->dump_string("class", i.second);
6814 f->close_section();
6815 f->flush(rdata);
6816 } else if (prefix == "osd crush class ls-osd") {
6817 string name;
6818 cmd_getval(cmdmap, "class", name);
6819 set<int> osds;
6820 osdmap.crush->get_devices_by_class(name, &osds);
6821 if (f) {
6822 f->open_array_section("osds");
6823 for (auto &osd: osds)
6824 f->dump_int("osd", osd);
6825 f->close_section();
6826 f->flush(rdata);
6827 } else {
6828 bool first = true;
6829 for (auto &osd : osds) {
6830 if (!first)
6831 ds << "\n";
6832 first = false;
6833 ds << osd;
6834 }
6835 rdata.append(ds);
6836 }
6837 } else if (prefix == "osd crush get-device-class") {
6838 vector<string> idvec;
6839 cmd_getval(cmdmap, "ids", idvec);
6840 map<int, string> class_by_osd;
6841 for (auto& id : idvec) {
6842 ostringstream ts;
6843 long osd = parse_osd_id(id.c_str(), &ts);
6844 if (osd < 0) {
6845 ss << "unable to parse osd id:'" << id << "'";
6846 r = -EINVAL;
6847 goto reply;
6848 }
6849 auto device_class = osdmap.crush->get_item_class(osd);
6850 if (device_class)
6851 class_by_osd[osd] = device_class;
6852 else
6853 class_by_osd[osd] = ""; // no class
6854 }
6855 if (f) {
6856 f->open_array_section("osd_device_classes");
6857 for (auto& i : class_by_osd) {
6858 f->open_object_section("osd_device_class");
6859 f->dump_int("osd", i.first);
6860 f->dump_string("device_class", i.second);
6861 f->close_section();
6862 }
6863 f->close_section();
6864 f->flush(rdata);
6865 } else {
6866 if (class_by_osd.size() == 1) {
6867 // for single input, make a clean output
6868 ds << class_by_osd.begin()->second;
6869 } else {
6870 // note that we do not group osds by class here
6871 for (auto it = class_by_osd.begin();
6872 it != class_by_osd.end();
6873 it++) {
6874 ds << "osd." << it->first << ' ' << it->second;
6875 if (next(it) != class_by_osd.end())
6876 ds << '\n';
6877 }
6878 }
6879 rdata.append(ds);
6880 }
6881 } else if (prefix == "osd erasure-code-profile ls") {
6882 const auto &profiles = osdmap.get_erasure_code_profiles();
6883 if (f)
6884 f->open_array_section("erasure-code-profiles");
6885 for (auto i = profiles.begin(); i != profiles.end(); ++i) {
6886 if (f)
6887 f->dump_string("profile", i->first.c_str());
6888 else
6889 rdata.append(i->first + "\n");
6890 }
6891 if (f) {
6892 f->close_section();
6893 ostringstream rs;
6894 f->flush(rs);
6895 rs << "\n";
6896 rdata.append(rs.str());
6897 }
6898 } else if (prefix == "osd crush weight-set ls") {
6899 boost::scoped_ptr<Formatter> f(Formatter::create(format));
6900 if (f) {
6901 f->open_array_section("weight_sets");
6902 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6903 f->dump_string("pool", "(compat)");
6904 }
6905 for (auto& i : osdmap.crush->choose_args) {
6906 if (i.first >= 0) {
6907 f->dump_string("pool", osdmap.get_pool_name(i.first));
6908 }
6909 }
6910 f->close_section();
6911 f->flush(rdata);
6912 } else {
6913 ostringstream rs;
6914 if (osdmap.crush->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS)) {
6915 rs << "(compat)\n";
6916 }
6917 for (auto& i : osdmap.crush->choose_args) {
6918 if (i.first >= 0) {
6919 rs << osdmap.get_pool_name(i.first) << "\n";
6920 }
6921 }
6922 rdata.append(rs.str());
6923 }
6924 } else if (prefix == "osd crush weight-set dump") {
6925 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6926 "json-pretty"));
6927 osdmap.crush->dump_choose_args(f.get());
6928 f->flush(rdata);
6929 } else if (prefix == "osd erasure-code-profile get") {
6930 string name;
6931 cmd_getval(cmdmap, "name", name);
6932 if (!osdmap.has_erasure_code_profile(name)) {
6933 ss << "unknown erasure code profile '" << name << "'";
6934 r = -ENOENT;
6935 goto reply;
6936 }
6937 const map<string,string> &profile = osdmap.get_erasure_code_profile(name);
6938 if (f)
6939 f->open_object_section("profile");
6940 for (map<string,string>::const_iterator i = profile.begin();
6941 i != profile.end();
6942 ++i) {
6943 if (f)
6944 f->dump_string(i->first.c_str(), i->second.c_str());
6945 else
6946 rdata.append(i->first + "=" + i->second + "\n");
6947 }
6948 if (f) {
6949 f->close_section();
6950 ostringstream rs;
6951 f->flush(rs);
6952 rs << "\n";
6953 rdata.append(rs.str());
6954 }
6955 } else if (prefix == "osd pool application get") {
6956 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
6957 "json-pretty"));
6958 string pool_name;
6959 cmd_getval(cmdmap, "pool", pool_name);
6960 string app;
6961 cmd_getval(cmdmap, "app", app);
6962 string key;
6963 cmd_getval(cmdmap, "key", key);
6964
6965 if (pool_name.empty()) {
6966 // all
6967 f->open_object_section("pools");
6968 for (const auto &pool : osdmap.pools) {
6969 std::string name("<unknown>");
6970 const auto &pni = osdmap.pool_name.find(pool.first);
6971 if (pni != osdmap.pool_name.end())
6972 name = pni->second;
6973 f->open_object_section(name.c_str());
6974 for (auto &app_pair : pool.second.application_metadata) {
6975 f->open_object_section(app_pair.first.c_str());
6976 for (auto &kv_pair : app_pair.second) {
6977 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
6978 }
6979 f->close_section();
6980 }
6981 f->close_section(); // name
6982 }
6983 f->close_section(); // pools
6984 f->flush(rdata);
6985 } else {
6986 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
6987 if (pool < 0) {
6988 ss << "unrecognized pool '" << pool_name << "'";
6989 r = -ENOENT;
6990 goto reply;
6991 }
6992 auto p = osdmap.get_pg_pool(pool);
6993 // filter by pool
6994 if (app.empty()) {
6995 f->open_object_section(pool_name.c_str());
6996 for (auto &app_pair : p->application_metadata) {
6997 f->open_object_section(app_pair.first.c_str());
6998 for (auto &kv_pair : app_pair.second) {
6999 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7000 }
7001 f->close_section(); // application
7002 }
7003 f->close_section(); // pool_name
7004 f->flush(rdata);
7005 goto reply;
7006 }
7007
7008 auto app_it = p->application_metadata.find(app);
7009 if (app_it == p->application_metadata.end()) {
7010 ss << "pool '" << pool_name << "' has no application '" << app << "'";
7011 r = -ENOENT;
7012 goto reply;
7013 }
7014 // filter by pool + app
7015 if (key.empty()) {
7016 f->open_object_section(app_it->first.c_str());
7017 for (auto &kv_pair : app_it->second) {
7018 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
7019 }
7020 f->close_section(); // application
7021 f->flush(rdata);
7022 goto reply;
7023 }
7024 // filter by pool + app + key
7025 auto key_it = app_it->second.find(key);
7026 if (key_it == app_it->second.end()) {
7027 ss << "application '" << app << "' on pool '" << pool_name
7028 << "' does not have key '" << key << "'";
7029 r = -ENOENT;
7030 goto reply;
7031 }
7032 ss << key_it->second << "\n";
7033 rdata.append(ss.str());
7034 ss.str("");
7035 }
7036 } else if (prefix == "osd get-require-min-compat-client") {
7037 ss << osdmap.require_min_compat_client << std::endl;
7038 rdata.append(ss.str());
7039 ss.str("");
7040 goto reply;
7041 } else if (prefix == "osd pool application enable" ||
7042 prefix == "osd pool application disable" ||
7043 prefix == "osd pool application set" ||
7044 prefix == "osd pool application rm") {
7045 bool changed = false;
7046 r = preprocess_command_pool_application(prefix, cmdmap, ss, &changed);
7047 if (r != 0) {
7048 // Error, reply.
7049 goto reply;
7050 } else if (changed) {
7051 // Valid mutation, proceed to prepare phase
7052 return false;
7053 } else {
7054 // Idempotent case, reply
7055 goto reply;
7056 }
7057 } else {
7058 // try prepare update
7059 return false;
7060 }
7061
7062 reply:
7063 string rs;
7064 getline(ss, rs);
7065 mon.reply_command(op, r, rs, rdata, get_last_committed());
7066 return true;
7067 }
7068
7069 void OSDMonitor::set_pool_flags(int64_t pool_id, uint64_t flags)
7070 {
7071 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7072 osdmap.get_pg_pool(pool_id));
7073 ceph_assert(pool);
7074 pool->set_flag(flags);
7075 }
7076
7077 void OSDMonitor::clear_pool_flags(int64_t pool_id, uint64_t flags)
7078 {
7079 pg_pool_t *pool = pending_inc.get_new_pool(pool_id,
7080 osdmap.get_pg_pool(pool_id));
7081 ceph_assert(pool);
7082 pool->unset_flag(flags);
7083 }
7084
7085 string OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch)
7086 {
7087 char k[80];
7088 snprintf(k, sizeof(k), "purged_epoch_%08lx", (unsigned long)epoch);
7089 return k;
7090 }
7091
7092 string OSDMonitor::make_purged_snap_key(int64_t pool, snapid_t snap)
7093 {
7094 char k[80];
7095 snprintf(k, sizeof(k), "purged_snap_%llu_%016llx",
7096 (unsigned long long)pool, (unsigned long long)snap);
7097 return k;
7098 }
7099
// Build the key and encode the value for a purged-snap record covering
// the interval [snap, snap+num).
//
// The value encodes (begin, end, epoch).  The key embeds the *last* snap
// id of the interval (snap+num-1) so that a forward-only lower_bound()
// scan from any snap id lands on the record that could contain it
// (see lookup_purged_snap()).
//
// @param pool   pool id the interval belongs to
// @param snap   first snap id in the purged interval
// @param num    number of snap ids in the interval
// @param epoch  epoch at which the purge was recorded
// @param v      [out] encoded value
// @return the kv key to store v under
string OSDMonitor::make_purged_snap_key_value(
  int64_t pool, snapid_t snap, snapid_t num,
  epoch_t epoch, bufferlist *v)
{
  // encode the *last* epoch in the key so that we can use forward
  // iteration only to search for an epoch in an interval.
  encode(snap, *v);
  encode(snap + num, *v);
  encode(epoch, *v);
  return make_purged_snap_key(pool, snap + num - 1);
}
7111
7112
// Look up the purged-snap record (if any) whose interval contains
// (pool, snap).
//
// Records are keyed by the *last* snap id of their interval (see
// make_purged_snap_key_value), so a lower_bound() on the key for `snap`
// lands on the only record that could contain it.
//
// @param pool   pool id to search
// @param snap   snap id to search for
// @param begin  [out] start of the containing purged interval
// @param end    [out] one past the last snap of the containing interval
// @return 0 if a containing interval was found, -ENOENT otherwise
int OSDMonitor::lookup_purged_snap(
  int64_t pool, snapid_t snap,
  snapid_t *begin, snapid_t *end)
{
  string k = make_purged_snap_key(pool, snap);
  auto it = mon.store->get_iterator(OSD_SNAP_PREFIX);
  it->lower_bound(k);
  if (!it->valid()) {
    // ran off the end of the namespace entirely
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' not found" << dendl;
    return -ENOENT;
  }
  if (it->key().find("purged_snap_") != 0) {
    // landed on some other record type in the OSD_SNAP_PREFIX namespace
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << it->key()
	     << "', wrong prefix" << dendl;
    return -ENOENT;
  }
  string gotk = it->key();
  const char *format = "purged_snap_%llu_";
  long long int keypool;
  int n = sscanf(gotk.c_str(), format, &keypool);
  if (n != 1) {
    derr << __func__ << " invalid k '" << gotk << "'" << dendl;
    return -ENOENT;
  }
  if (pool != keypool) {
    // lower_bound crossed into the next pool's records
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - key '" << k << "' got '" << gotk
	     << "', wrong pool " << keypool
	     << dendl;
    return -ENOENT;
  }
  // value encodes (begin, end[, epoch]); we only need the interval here
  bufferlist v = it->value();
  auto p = v.cbegin();
  decode(*begin, p);
  decode(*end, p);
  if (snap < *begin || snap >= *end) {
    // nearest record's interval does not actually cover `snap`
    dout(20) << __func__
	     << " pool " << pool << " snap " << snap
	     << " - found [" << *begin << "," << *end << "), no overlap"
	     << dendl;
    return -ENOENT;
  }
  return 0;
}
7162
// Record the interval [start, end) as purged for `pool` in the
// OSD_SNAP_PREFIX kv namespace, coalescing with any adjacent
// already-purged intervals so the store keeps one record per maximal
// purged run.  Four cases, depending on whether an existing record
// abuts the new interval on the left (ends at start) and/or on the
// right (begins at end).
void OSDMonitor::insert_purged_snap_update(
  int64_t pool,
  snapid_t start, snapid_t end,
  epoch_t epoch,
  MonitorDBStore::TransactionRef t)
{
  snapid_t before_begin, before_end;
  snapid_t after_begin, after_end;
  // neighbor on the left: interval containing start-1, if any
  int b = lookup_purged_snap(pool, start - 1,
			     &before_begin, &before_end);
  // neighbor on the right: interval containing end, if any
  int a = lookup_purged_snap(pool, end,
			     &after_begin, &after_end);
  if (!b && !a) {
    // bridges two existing intervals: merge all three into one record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - joins ["
	     << before_begin << "," << before_end << ") and ["
	     << after_begin << "," << after_end << ")" << dendl;
    // erase only the begin record; we'll overwrite the end one.
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, after_end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!b) {
    // extends an earlier interval to the right; the key changes (it is
    // derived from the interval's last snap), so erase + re-put
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with earlier ["
	     << before_begin << "," << before_end << ")" << dendl;
    t->erase(OSD_SNAP_PREFIX, make_purged_snap_key(pool, before_end - 1));
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  before_begin, end - before_begin,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else if (!a) {
    // extends a later interval to the left; the key (last snap) is
    // unchanged, so simply overwrite the record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - join with later ["
	     << after_begin << "," << after_end << ")" << dendl;
    // overwrite after record
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, after_end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  } else {
    // isolated interval: create a fresh record
    dout(10) << __func__
	     << " [" << start << "," << end << ") - new"
	     << dendl;
    bufferlist v;
    string k = make_purged_snap_key_value(pool,
					  start, end - start,
					  pending_inc.epoch, &v);
    t->put(OSD_SNAP_PREFIX, k, v);
  }
}
7218
// Walk the mgr-reported purged snaps per pool and queue (in
// pending_inc.new_purged_snaps) the ranges that can now be pruned from
// the osdmap's removed_snaps_queue, budgeted by
// mon_max_snap_prune_per_epoch.
//
// @return true if anything was queued for pruning this epoch
bool OSDMonitor::try_prune_purged_snaps()
{
  // the mgr stat digest is the source of the purged-snap reports
  if (!mon.mgrstatmon()->is_readable()) {
    return false;
  }
  if (!pending_inc.new_purged_snaps.empty()) {
    return false; // we already pruned for this epoch
  }

  // 0 means "no limit"; substitute a large finite cap
  unsigned max_prune = cct->_conf.get_val<uint64_t>(
    "mon_max_snap_prune_per_epoch");
  if (!max_prune) {
    max_prune = 100000;
  }
  dout(10) << __func__ << " max_prune " << max_prune << dendl;

  unsigned actually_pruned = 0;
  auto& purged_snaps = mon.mgrstatmon()->get_digest().purged_snaps;
  for (auto& p : osdmap.get_pools()) {
    auto q = purged_snaps.find(p.first);
    if (q == purged_snaps.end()) {
      continue;
    }
    auto& purged = q->second;
    if (purged.empty()) {
      dout(20) << __func__ << " " << p.first << " nothing purged" << dendl;
      continue;
    }
    dout(20) << __func__ << " pool " << p.first << " purged " << purged << dendl;
    snap_interval_set_t to_prune;
    unsigned maybe_pruned = actually_pruned;
    for (auto i = purged.begin(); i != purged.end(); ++i) {
      snapid_t begin = i.get_start();
      auto end = i.get_start() + i.get_len();
      snapid_t pbegin = 0, pend = 0;
      int r = lookup_purged_snap(p.first, begin, &pbegin, &pend);
      if (r == 0) {
	// already purged.
	// be a bit aggressive about backing off here, because the mon may
	// do a lot of work going through this set, and if we know the
	// purged set from the OSDs is at least *partly* stale we may as
	// well wait for it to be fresh.
	dout(20) << __func__ << " we've already purged " << pbegin
		 << "~" << (pend - pbegin) << dendl;
	break; // next pool
      }
      if (pbegin && pbegin > begin && pbegin < end) {
	// the tail of [begin,end) is purged; shorten the range
	end = pbegin;
      }
      to_prune.insert(begin, end - begin);
      maybe_pruned += end - begin;
      if (maybe_pruned >= max_prune) {
	// budget reached for this epoch
	break;
      }
    }
    if (!to_prune.empty()) {
      // PGs may still be reporting things as purged that we have already
      // pruned from removed_snaps_queue.
      snap_interval_set_t actual;
      auto r = osdmap.removed_snaps_queue.find(p.first);
      if (r != osdmap.removed_snaps_queue.end()) {
	actual.intersection_of(to_prune, r->second);
      }
      actually_pruned += actual.size();
      dout(10) << __func__ << " pool " << p.first << " reports pruned " << to_prune
	       << ", actual pruned " << actual << dendl;
      if (!actual.empty()) {
	pending_inc.new_purged_snaps[p.first].swap(actual);
      }
    }
    if (actually_pruned >= max_prune) {
      break;
    }
  }
  dout(10) << __func__ << " actually pruned " << actually_pruned << dendl;
  // nonzero prune count => we mutated pending_inc
  return !!actually_pruned;
}
7297
// Compare each pool's usage (from the mgr stat digest) against its
// quota_max_bytes / quota_max_objects and toggle the pool's
// FULL_QUOTA/FULL flags in pending_inc accordingly.
//
// @return true if any pool flag was changed (pending_inc was touched)
bool OSDMonitor::update_pools_status()
{
  // usage numbers come from the mgr; nothing to do until they're readable
  if (!mon.mgrstatmon()->is_readable())
    return false;

  bool ret = false;

  auto& pools = osdmap.get_pools();
  for (auto it = pools.begin(); it != pools.end(); ++it) {
    // pools with no stats yet are skipped (no basis for a decision)
    const pool_stat_t *pstat = mon.mgrstatmon()->get_pool_stat(it->first);
    if (!pstat)
      continue;
    const object_stat_sum_t& sum = pstat->stats.sum;
    const pg_pool_t &pool = it->second;
    const string& pool_name = osdmap.get_pool_name(it->first);

    // a quota of 0 means "unlimited" for that dimension
    bool pool_is_full =
      (pool.quota_max_bytes > 0 && (uint64_t)sum.num_bytes >= pool.quota_max_bytes) ||
      (pool.quota_max_objects > 0 && (uint64_t)sum.num_objects >= pool.quota_max_objects);

    if (pool.has_flag(pg_pool_t::FLAG_FULL_QUOTA)) {
      // currently flagged full-by-quota: clear the flags only once usage
      // has dropped back under quota
      if (pool_is_full)
	continue;

      mon.clog->info() << "pool '" << pool_name
		       << "' no longer out of quota; removing NO_QUOTA flag";
      // below we cancel FLAG_FULL too, we'll set it again in
      // OSDMonitor::encode_pending if it still fails the osd-full checking.
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      ret = true;
    } else {
      // not currently flagged: only act if the pool just went over quota
      if (!pool_is_full)
	continue;

      if (pool.quota_max_bytes > 0 &&
	  (uint64_t)sum.num_bytes >= pool.quota_max_bytes) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_bytes: "
			 << byte_u_t(pool.quota_max_bytes) << ")";
      }
      if (pool.quota_max_objects > 0 &&
	  (uint64_t)sum.num_objects >= pool.quota_max_objects) {
	mon.clog->warn() << "pool '" << pool_name << "' is full"
			 << " (reached quota's max_objects: "
			 << pool.quota_max_objects << ")";
      }
      // set both FLAG_FULL_QUOTA and FLAG_FULL
      // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
      // since FLAG_FULL should always take precedence
      set_pool_flags(it->first,
		     pg_pool_t::FLAG_FULL_QUOTA | pg_pool_t::FLAG_FULL);
      clear_pool_flags(it->first,
		       pg_pool_t::FLAG_NEARFULL |
		       pg_pool_t::FLAG_BACKFILLFULL);
      ret = true;
    }
  }
  return ret;
}
7358
7359 int OSDMonitor::prepare_new_pool(MonOpRequestRef op)
7360 {
7361 op->mark_osdmon_event(__func__);
7362 auto m = op->get_req<MPoolOp>();
7363 dout(10) << "prepare_new_pool from " << m->get_connection() << dendl;
7364 MonSession *session = op->get_session();
7365 if (!session)
7366 return -EPERM;
7367 string erasure_code_profile;
7368 stringstream ss;
7369 string rule_name;
7370 bool bulk = false;
7371 int ret = 0;
7372 ret = prepare_new_pool(m->name, m->crush_rule, rule_name,
7373 0, 0, 0, 0, 0, 0, 0.0,
7374 erasure_code_profile,
7375 pg_pool_t::TYPE_REPLICATED, 0, FAST_READ_OFF, {}, bulk,
7376 &ss);
7377
7378 if (ret < 0) {
7379 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
7380 }
7381 return ret;
7382 }
7383
7384 int OSDMonitor::crush_rename_bucket(const string& srcname,
7385 const string& dstname,
7386 ostream *ss)
7387 {
7388 int ret;
7389 //
7390 // Avoid creating a pending crush if it does not already exists and
7391 // the rename would fail.
7392 //
7393 if (!_have_pending_crush()) {
7394 ret = _get_stable_crush().can_rename_bucket(srcname,
7395 dstname,
7396 ss);
7397 if (ret)
7398 return ret;
7399 }
7400
7401 CrushWrapper newcrush = _get_pending_crush();
7402
7403 ret = newcrush.rename_bucket(srcname,
7404 dstname,
7405 ss);
7406 if (ret)
7407 return ret;
7408
7409 pending_inc.crush.clear();
7410 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7411 *ss << "renamed bucket " << srcname << " into " << dstname;
7412 return 0;
7413 }
7414
7415 void OSDMonitor::check_legacy_ec_plugin(const string& plugin, const string& profile) const
7416 {
7417 string replacement = "";
7418
7419 if (plugin == "jerasure_generic" ||
7420 plugin == "jerasure_sse3" ||
7421 plugin == "jerasure_sse4" ||
7422 plugin == "jerasure_neon") {
7423 replacement = "jerasure";
7424 } else if (plugin == "shec_generic" ||
7425 plugin == "shec_sse3" ||
7426 plugin == "shec_sse4" ||
7427 plugin == "shec_neon") {
7428 replacement = "shec";
7429 }
7430
7431 if (replacement != "") {
7432 dout(0) << "WARNING: erasure coding profile " << profile << " uses plugin "
7433 << plugin << " that has been deprecated. Please use "
7434 << replacement << " instead." << dendl;
7435 }
7436 }
7437
7438 int OSDMonitor::normalize_profile(const string& profilename,
7439 ErasureCodeProfile &profile,
7440 bool force,
7441 ostream *ss)
7442 {
7443 ErasureCodeInterfaceRef erasure_code;
7444 ErasureCodePluginRegistry &instance = ErasureCodePluginRegistry::instance();
7445 ErasureCodeProfile::const_iterator plugin = profile.find("plugin");
7446 check_legacy_ec_plugin(plugin->second, profilename);
7447 int err = instance.factory(plugin->second,
7448 g_conf().get_val<std::string>("erasure_code_dir"),
7449 profile, &erasure_code, ss);
7450 if (err) {
7451 return err;
7452 }
7453
7454 err = erasure_code->init(profile, ss);
7455 if (err) {
7456 return err;
7457 }
7458
7459 auto it = profile.find("stripe_unit");
7460 if (it != profile.end()) {
7461 string err_str;
7462 uint32_t stripe_unit = strict_iecstrtoll(it->second, &err_str);
7463 if (!err_str.empty()) {
7464 *ss << "could not parse stripe_unit '" << it->second
7465 << "': " << err_str << std::endl;
7466 return -EINVAL;
7467 }
7468 uint32_t data_chunks = erasure_code->get_data_chunk_count();
7469 uint32_t chunk_size = erasure_code->get_chunk_size(stripe_unit * data_chunks);
7470 if (chunk_size != stripe_unit) {
7471 *ss << "stripe_unit " << stripe_unit << " does not match ec profile "
7472 << "alignment. Would be padded to " << chunk_size
7473 << std::endl;
7474 return -EINVAL;
7475 }
7476 if ((stripe_unit % 4096) != 0 && !force) {
7477 *ss << "stripe_unit should be a multiple of 4096 bytes for best performance."
7478 << "use --force to override this check" << std::endl;
7479 return -EINVAL;
7480 }
7481 }
7482 return 0;
7483 }
7484
7485 int OSDMonitor::crush_rule_create_erasure(const string &name,
7486 const string &profile,
7487 int *rule,
7488 ostream *ss)
7489 {
7490 int ruleid = osdmap.crush->get_rule_id(name);
7491 if (ruleid != -ENOENT) {
7492 *rule = ruleid;
7493 return -EEXIST;
7494 }
7495
7496 CrushWrapper newcrush = _get_pending_crush();
7497
7498 ruleid = newcrush.get_rule_id(name);
7499 if (ruleid != -ENOENT) {
7500 *rule = ruleid;
7501 return -EALREADY;
7502 } else {
7503 ErasureCodeInterfaceRef erasure_code;
7504 int err = get_erasure_code(profile, &erasure_code, ss);
7505 if (err) {
7506 *ss << "failed to load plugin using profile " << profile << std::endl;
7507 return err;
7508 }
7509
7510 err = erasure_code->create_rule(name, newcrush, ss);
7511 erasure_code.reset();
7512 if (err < 0)
7513 return err;
7514 *rule = err;
7515 pending_inc.crush.clear();
7516 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
7517 return 0;
7518 }
7519 }
7520
7521 int OSDMonitor::get_erasure_code(const string &erasure_code_profile,
7522 ErasureCodeInterfaceRef *erasure_code,
7523 ostream *ss) const
7524 {
7525 if (pending_inc.has_erasure_code_profile(erasure_code_profile))
7526 return -EAGAIN;
7527 ErasureCodeProfile profile =
7528 osdmap.get_erasure_code_profile(erasure_code_profile);
7529 ErasureCodeProfile::const_iterator plugin =
7530 profile.find("plugin");
7531 if (plugin == profile.end()) {
7532 *ss << "cannot determine the erasure code plugin"
7533 << " because there is no 'plugin' entry in the erasure_code_profile "
7534 << profile << std::endl;
7535 return -EINVAL;
7536 }
7537 check_legacy_ec_plugin(plugin->second, erasure_code_profile);
7538 auto& instance = ErasureCodePluginRegistry::instance();
7539 return instance.factory(plugin->second,
7540 g_conf().get_val<std::string>("erasure_code_dir"),
7541 profile, erasure_code, ss);
7542 }
7543
7544 int OSDMonitor::check_cluster_features(uint64_t features,
7545 stringstream &ss)
7546 {
7547 stringstream unsupported_ss;
7548 int unsupported_count = 0;
7549 if ((mon.get_quorum_con_features() & features) != features) {
7550 unsupported_ss << "the monitor cluster";
7551 ++unsupported_count;
7552 }
7553
7554 set<int32_t> up_osds;
7555 osdmap.get_up_osds(up_osds);
7556 for (set<int32_t>::iterator it = up_osds.begin();
7557 it != up_osds.end(); ++it) {
7558 const osd_xinfo_t &xi = osdmap.get_xinfo(*it);
7559 if ((xi.features & features) != features) {
7560 if (unsupported_count > 0)
7561 unsupported_ss << ", ";
7562 unsupported_ss << "osd." << *it;
7563 unsupported_count ++;
7564 }
7565 }
7566
7567 if (unsupported_count > 0) {
7568 ss << "features " << features << " unsupported by: "
7569 << unsupported_ss.str();
7570 return -ENOTSUP;
7571 }
7572
7573 // check pending osd state, too!
7574 for (map<int32_t,osd_xinfo_t>::const_iterator p =
7575 pending_inc.new_xinfo.begin();
7576 p != pending_inc.new_xinfo.end(); ++p) {
7577 const osd_xinfo_t &xi = p->second;
7578 if ((xi.features & features) != features) {
7579 dout(10) << __func__ << " pending osd." << p->first
7580 << " features are insufficient; retry" << dendl;
7581 return -EAGAIN;
7582 }
7583 }
7584
7585 return 0;
7586 }
7587
7588 bool OSDMonitor::validate_crush_against_features(const CrushWrapper *newcrush,
7589 stringstream& ss)
7590 {
7591 OSDMap::Incremental new_pending = pending_inc;
7592 encode(*newcrush, new_pending.crush, mon.get_quorum_con_features());
7593 OSDMap newmap;
7594 newmap.deepish_copy_from(osdmap);
7595 newmap.apply_incremental(new_pending);
7596
7597 // client compat
7598 if (newmap.require_min_compat_client != ceph_release_t::unknown) {
7599 auto mv = newmap.get_min_compat_client();
7600 if (mv > newmap.require_min_compat_client) {
7601 ss << "new crush map requires client version " << mv
7602 << " but require_min_compat_client is "
7603 << newmap.require_min_compat_client;
7604 return false;
7605 }
7606 }
7607
7608 // osd compat
7609 uint64_t features =
7610 newmap.get_features(CEPH_ENTITY_TYPE_MON, NULL) |
7611 newmap.get_features(CEPH_ENTITY_TYPE_OSD, NULL);
7612 stringstream features_ss;
7613 int r = check_cluster_features(features, features_ss);
7614 if (r) {
7615 ss << "Could not change CRUSH: " << features_ss.str();
7616 return false;
7617 }
7618
7619 return true;
7620 }
7621
7622 bool OSDMonitor::erasure_code_profile_in_use(
7623 const mempool::osdmap::map<int64_t, pg_pool_t> &pools,
7624 const string &profile,
7625 ostream *ss)
7626 {
7627 bool found = false;
7628 for (map<int64_t, pg_pool_t>::const_iterator p = pools.begin();
7629 p != pools.end();
7630 ++p) {
7631 if (p->second.erasure_code_profile == profile && p->second.is_erasure()) {
7632 *ss << osdmap.pool_name[p->first] << " ";
7633 found = true;
7634 }
7635 }
7636 if (found) {
7637 *ss << "pool(s) are using the erasure code profile '" << profile << "'";
7638 }
7639 return found;
7640 }
7641
7642 int OSDMonitor::parse_erasure_code_profile(const vector<string> &erasure_code_profile,
7643 map<string,string> *erasure_code_profile_map,
7644 ostream *ss)
7645 {
7646 int r = g_conf().with_val<string>("osd_pool_default_erasure_code_profile",
7647 get_json_str_map,
7648 *ss,
7649 erasure_code_profile_map,
7650 true);
7651 if (r)
7652 return r;
7653 ceph_assert((*erasure_code_profile_map).count("plugin"));
7654 string default_plugin = (*erasure_code_profile_map)["plugin"];
7655 map<string,string> user_map;
7656 for (vector<string>::const_iterator i = erasure_code_profile.begin();
7657 i != erasure_code_profile.end();
7658 ++i) {
7659 size_t equal = i->find('=');
7660 if (equal == string::npos) {
7661 user_map[*i] = string();
7662 (*erasure_code_profile_map)[*i] = string();
7663 } else {
7664 const string key = i->substr(0, equal);
7665 equal++;
7666 const string value = i->substr(equal);
7667 if (key.find("ruleset-") == 0) {
7668 *ss << "property '" << key << "' is no longer supported; try "
7669 << "'crush-" << key.substr(8) << "' instead";
7670 return -EINVAL;
7671 }
7672 user_map[key] = value;
7673 (*erasure_code_profile_map)[key] = value;
7674 }
7675 }
7676
7677 if (user_map.count("plugin") && user_map["plugin"] != default_plugin)
7678 (*erasure_code_profile_map) = user_map;
7679
7680 return 0;
7681 }
7682
7683 int OSDMonitor::prepare_pool_size(const unsigned pool_type,
7684 const string &erasure_code_profile,
7685 uint8_t repl_size,
7686 unsigned *size, unsigned *min_size,
7687 ostream *ss)
7688 {
7689 int err = 0;
7690 bool set_min_size = false;
7691 switch (pool_type) {
7692 case pg_pool_t::TYPE_REPLICATED:
7693 if (osdmap.stretch_mode_enabled) {
7694 if (repl_size == 0)
7695 repl_size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
7696 if (repl_size != g_conf().get_val<uint64_t>("mon_stretch_pool_size")) {
7697 *ss << "prepare_pool_size: we are in stretch mode but size "
7698 << repl_size << " does not match!";
7699 return -EINVAL;
7700 }
7701 *min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
7702 set_min_size = true;
7703 }
7704 if (repl_size == 0) {
7705 repl_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
7706 }
7707 *size = repl_size;
7708 if (!set_min_size)
7709 *min_size = g_conf().get_osd_pool_default_min_size(repl_size);
7710 break;
7711 case pg_pool_t::TYPE_ERASURE:
7712 {
7713 if (osdmap.stretch_mode_enabled) {
7714 *ss << "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7715 return -EINVAL;
7716 }
7717 ErasureCodeInterfaceRef erasure_code;
7718 err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
7719 if (err == 0) {
7720 *size = erasure_code->get_chunk_count();
7721 *min_size =
7722 erasure_code->get_data_chunk_count() +
7723 std::min<int>(1, erasure_code->get_coding_chunk_count() - 1);
7724 assert(*min_size <= *size);
7725 assert(*min_size >= erasure_code->get_data_chunk_count());
7726 }
7727 }
7728 break;
7729 default:
7730 *ss << "prepare_pool_size: " << pool_type << " is not a known pool type";
7731 err = -EINVAL;
7732 break;
7733 }
7734 return err;
7735 }
7736
// Compute the stripe width for a new pool.  Replicated pools have no
// stripe width (left untouched).  For erasure pools it is
// data_chunks * chunk_size, where the stripe unit comes from the
// profile's "stripe_unit" entry if present, else from the
// osd_pool_erasure_code_stripe_unit option.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile name (erasure pools only)
// @param stripe_width          [out] computed stripe width (erasure only)
// @param ss                    [out] error messages
// @return 0 on success, -EINVAL for unknown pool types, or the error
//         from get_erasure_code()
int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type,
					  const string &erasure_code_profile,
					  uint32_t *stripe_width,
					  ostream *ss)
{
  int err = 0;
  switch (pool_type) {
  case pg_pool_t::TYPE_REPLICATED:
    // ignored
    break;
  case pg_pool_t::TYPE_ERASURE:
    {
      ErasureCodeProfile profile =
	osdmap.get_erasure_code_profile(erasure_code_profile);
      ErasureCodeInterfaceRef erasure_code;
      err = get_erasure_code(erasure_code_profile, &erasure_code, ss);
      if (err)
	break;
      uint32_t data_chunks = erasure_code->get_data_chunk_count();
      uint32_t stripe_unit = g_conf().get_val<Option::size_t>("osd_pool_erasure_code_stripe_unit");
      // profile override of the configured default stripe unit;
      // normalize_profile() already validated it, hence the assert
      auto it = profile.find("stripe_unit");
      if (it != profile.end()) {
	string err_str;
	stripe_unit = strict_iecstrtoll(it->second, &err_str);
	ceph_assert(err_str.empty());
      }
      // let the plugin round the per-chunk size to its alignment
      *stripe_width = data_chunks *
	erasure_code->get_chunk_size(stripe_unit * data_chunks);
    }
    break;
  default:
    *ss << "prepare_pool_stripe_width: "
	<< pool_type << " is not a known pool type";
    err = -EINVAL;
    break;
  }
  return err;
}
7775
7776 int OSDMonitor::get_replicated_stretch_crush_rule()
7777 {
7778 /* we don't write down the stretch rule anywhere, so
7779 * we have to guess it. How? Look at all the pools
7780 * and count up how many times a given rule is used
7781 * on stretch pools and then return the one with
7782 * the most users!
7783 */
7784 map<int,int> rule_counts;
7785 for (const auto& pooli : osdmap.pools) {
7786 const pg_pool_t& p = pooli.second;
7787 if (p.is_replicated() && p.is_stretch_pool()) {
7788 if (!rule_counts.count(p.crush_rule)) {
7789 rule_counts[p.crush_rule] = 1;
7790 } else {
7791 ++rule_counts[p.crush_rule];
7792 }
7793 }
7794 }
7795
7796 if (rule_counts.empty()) {
7797 return -ENOENT;
7798 }
7799
7800 int most_used_count = 0;
7801 int most_used_rule = -1;
7802 for (auto i : rule_counts) {
7803 if (i.second > most_used_count) {
7804 most_used_rule = i.first;
7805 most_used_count = i.second;
7806 }
7807 }
7808 ceph_assert(most_used_count > 0);
7809 ceph_assert(most_used_rule >= 0);
7810 return most_used_rule;
7811 }
7812
// Resolve the crush rule for a new pool.
//
// If *crush_rule is already >= 0 it is only validated for existence.
// Otherwise: replicated pools fall back to the named rule, the stretch
// rule (in stretch mode), or the configured default; erasure pools
// create (or find) a rule derived from the erasure-code profile.
//
// @param pool_type             pg_pool_t::TYPE_REPLICATED or TYPE_ERASURE
// @param erasure_code_profile  profile name (erasure pools only)
// @param rule_name             requested rule name, "" = default
// @param crush_rule            [in,out] rule id; <0 on input = resolve it
// @param ss                    [out] error messages
// @return 0 on success; -ENOENT/-EINVAL on failure; -EAGAIN when a newly
//         staged erasure rule must be proposed before the caller retries
int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type,
					const string &erasure_code_profile,
					const string &rule_name,
					int *crush_rule,
					ostream *ss)
{

  if (*crush_rule < 0) {
    switch (pool_type) {
    case pg_pool_t::TYPE_REPLICATED:
      {
	if (rule_name == "") {
	  if (osdmap.stretch_mode_enabled) {
	    // stretch mode: infer the rule from existing stretch pools
	    *crush_rule = get_replicated_stretch_crush_rule();
	  } else {
	    // Use default rule
	    *crush_rule = osdmap.crush->get_osd_pool_default_crush_replicated_rule(cct);
	  }
	  if (*crush_rule < 0) {
	    // Errors may happen e.g. if no valid rule is available
	    *ss << "No suitable CRUSH rule exists, check "
		<< "'osd pool default crush *' config options";
	    return -ENOENT;
	  }
	} else {
	  return get_crush_rule(rule_name, crush_rule, ss);
	}
      }
      break;
    case pg_pool_t::TYPE_ERASURE:
      {
	int err = crush_rule_create_erasure(rule_name,
					    erasure_code_profile,
					    crush_rule, ss);
	// remap create_erasure's result: a rule that exists in the
	// committed map (-EEXIST) is usable now (0); a rule just staged
	// (0) or pending (-EALREADY) requires a retry after the
	// proposal commits (-EAGAIN)
	switch (err) {
	case -EALREADY:
	  dout(20) << "prepare_pool_crush_rule: rule "
		   << rule_name << " try again" << dendl;
	  // fall through
	case 0:
	  // need to wait for the crush rule to be proposed before proceeding
	  err = -EAGAIN;
	  break;
	case -EEXIST:
	  err = 0;
	  break;
	}
	return err;
      }
      break;
    default:
      *ss << "prepare_pool_crush_rule: " << pool_type
	  << " is not a known pool type";
      return -EINVAL;
    }
  } else {
    // caller supplied an explicit rule id; just validate it
    if (!osdmap.crush->rule_exists(*crush_rule)) {
      *ss << "CRUSH rule " << *crush_rule << " not found";
      return -ENOENT;
    }
  }

  return 0;
}
7877
7878 int OSDMonitor::get_crush_rule(const string &rule_name,
7879 int *crush_rule,
7880 ostream *ss)
7881 {
7882 int ret;
7883 ret = osdmap.crush->get_rule_id(rule_name);
7884 if (ret != -ENOENT) {
7885 // found it, use it
7886 *crush_rule = ret;
7887 } else {
7888 CrushWrapper newcrush = _get_pending_crush();
7889
7890 ret = newcrush.get_rule_id(rule_name);
7891 if (ret != -ENOENT) {
7892 // found it, wait for it to be proposed
7893 dout(20) << __func__ << ": rule " << rule_name
7894 << " try again" << dendl;
7895 return -EAGAIN;
7896 } else {
7897 // Cannot find it , return error
7898 *ss << "specified rule " << rule_name << " doesn't exist";
7899 return ret;
7900 }
7901 }
7902 return 0;
7903 }
7904
// Check whether creating a pool (pool < 0) or changing an existing
// pool's pg_num would push the cluster over
// mon_max_pg_per_osd * num_osds total PG instances.
//
// When the PG mapping is current we count actual acting PGs on the OSDs
// reachable from the rule's crush roots; otherwise we fall back to
// summing every pool's pg_num_target * size.
//
// @param pool        pool id being changed, or <0 for a new pool
// @param pg_num      proposed pg_num
// @param size        proposed pool size (replica/chunk count)
// @param crush_rule  crush rule the pool (will) use
// @param ss          [out] error message on rejection
// @return 0 if acceptable, -ERANGE if the projected total exceeds the cap
int OSDMonitor::check_pg_num(int64_t pool, int pg_num, int size, int crush_rule, ostream *ss)
{
  auto max_pgs_per_osd = g_conf().get_val<uint64_t>("mon_max_pg_per_osd");
  uint64_t projected = 0;
  unsigned osd_num = 0;
  // assume min cluster size 3
  auto num_osds = std::max(osdmap.get_num_in_osds(), 3u);
  if (pool < 0) {
    // a new pool
    projected += pg_num * size;
  }
  if (mapping.get_epoch() >= osdmap.get_epoch()) {
    // mapping is up to date: count real acting PGs on the OSDs under the
    // rule's crush take roots
    set<int> roots;
    CrushWrapper newcrush = _get_pending_crush();
    newcrush.find_takes_by_rule(crush_rule, &roots);
    int max_osd = osdmap.get_max_osd();
    for (auto root : roots) {
      const char *rootname = newcrush.get_item_name(root);
      set<int> osd_ids;
      newcrush.get_leaves(rootname, &osd_ids);
      unsigned out_osd = 0;
      for (auto id : osd_ids) {
	if (id > max_osd) {
	  // crush leaf beyond the osdmap's max osd id; not a usable OSD
	  out_osd++;
	  continue;
	}
	projected += mapping.get_osd_acting_pgs(id).size();
      }
      osd_num += osd_ids.size() - out_osd;
    }
    if (pool >= 0) {
      // update an existing pool's pg num
      const auto& pg_info = osdmap.get_pools().at(pool);
      // the loop above already counted this pool's current PGs, so add
      // the proposed count and subtract the current target to get the
      // net projection
      projected += pg_num * size;
      projected -= pg_info.get_pg_num_target() * pg_info.get_size();
    }
    num_osds = std::max(osd_num, 3u); // assume min cluster size 3
  } else {
    // use pg_num target for evaluating the projected pg num
    for (const auto& [pool_id, pool_info] : osdmap.get_pools()) {
      if (pool_id == pool) {
	projected += pg_num * size;
      } else {
	projected += pool_info.get_pg_num_target() * pool_info.get_size();
      }
    }
  }
  auto max_pgs = max_pgs_per_osd * num_osds;
  if (projected > max_pgs) {
    if (pool >= 0) {
      *ss << "pool id " << pool;
    }
    *ss << " pg_num " << pg_num << " size " << size
	<< " would mean " << projected
	<< " total pgs, which exceeds max " << max_pgs
	<< " (mon_max_pg_per_osd " << max_pgs_per_osd
	<< " * num_in_osds " << num_osds << ")";
    return -ERANGE;
  }
  return 0;
}
7968
7969 /**
7970 * @param name The name of the new pool
7971 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_rule <0
7973 * @param pg_num The pg_num to use. If set to 0, will use the system default
7974 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
7975 * @param pg_num_min min pg_num
7976 * @param pg_num_max max pg_num
7977 * @param repl_size Replication factor, or 0 for default
7978 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
7979 * @param pool_type TYPE_ERASURE, or TYPE_REP
7980 * @param expected_num_objects expected number of objects on the pool
7981 * @param fast_read fast read type.
7982 * @param ss human readable error message, if any.
7983 *
7984 * @return 0 on success, negative errno on failure.
7985 */
int OSDMonitor::prepare_new_pool(string& name,
				 int crush_rule,
				 const string &crush_rule_name,
				 unsigned pg_num, unsigned pgp_num,
				 unsigned pg_num_min,
				 unsigned pg_num_max,
				 const uint64_t repl_size,
				 const uint64_t target_size_bytes,
				 const float target_size_ratio,
				 const string &erasure_code_profile,
				 const unsigned pool_type,
				 const uint64_t expected_num_objects,
				 FastReadType fast_read,
				 const string& pg_autoscale_mode,
				 bool bulk,
				 ostream *ss)
{
  // A pool must have a name.
  if (name.length() == 0)
    return -EINVAL;
  // Fill in a default pg_num: 1 when the autoscaler is "on" (it will
  // grow the pool itself), otherwise osd_pool_default_pg_num.
  if (pg_num == 0) {
    auto pg_num_from_mode =
      [pg_num=g_conf().get_val<uint64_t>("osd_pool_default_pg_num")]
      (const string& mode) {
	return mode == "on" ? 1 : pg_num;
      };
    pg_num = pg_num_from_mode(
      pg_autoscale_mode.empty() ?
      g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode") :
      pg_autoscale_mode);
  }
  // Default pgp_num from config, falling back to pg_num when the
  // configured default is itself 0.
  if (pgp_num == 0)
    pgp_num = g_conf().get_val<uint64_t>("osd_pool_default_pgp_num");
  if (!pgp_num)
    pgp_num = pg_num;
  // Sanity-check pg_num / pgp_num against the configured ceiling.
  if (pg_num > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
    *ss << "'pg_num' must be greater than 0 and less than or equal to "
	<< g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
	<< " (you may adjust 'mon max pool pg num' for higher values)";
    return -ERANGE;
  }
  if (pgp_num > pg_num) {
    *ss << "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
	<< ", which in this case is " << pg_num;
    return -ERANGE;
  }
  // fast_read only makes sense for erasure-coded pools.
  if (pool_type == pg_pool_t::TYPE_REPLICATED && fast_read == FAST_READ_ON) {
    *ss << "'fast_read' can only apply to erasure coding pool";
    return -EINVAL;
  }
  // Resolve (or create) the crush rule for this pool type.
  int r;
  r = prepare_pool_crush_rule(pool_type, erasure_code_profile,
			      crush_rule_name, &crush_rule, ss);
  if (r) {
    dout(10) << "prepare_pool_crush_rule returns " << r << dendl;
    return r;
  }
  // Derive size/min_size from repl_size or the EC profile.
  unsigned size, min_size;
  r = prepare_pool_size(pool_type, erasure_code_profile, repl_size,
			&size, &min_size, ss);
  if (r) {
    dout(10) << "prepare_pool_size returns " << r << dendl;
    return r;
  }
  // Optional smoke test: verify the rule can actually map `size`
  // replicas for a sample of inputs before committing to it.
  if (g_conf()->mon_osd_crush_smoke_test) {
    CrushWrapper newcrush = _get_pending_crush();
    ostringstream err;
    CrushTester tester(newcrush, err);
    tester.set_min_x(0);
    tester.set_max_x(50);
    tester.set_rule(crush_rule);
    tester.set_num_rep(size);
    auto start = ceph::coarse_mono_clock::now();
    // fork so a crush bug cannot take down the monitor; bounded by the
    // mon lease so we do not stall the quorum
    r = tester.test_with_fork(g_conf()->mon_lease);
    auto duration = ceph::coarse_mono_clock::now() - start;
    if (r < 0) {
      dout(10) << "tester.test_with_fork returns " << r
	       << ": " << err.str() << dendl;
      *ss << "crush test failed with " << r << ": " << err.str();
      return r;
    }
    dout(10) << __func__ << " crush smoke test duration: "
	     << duration << dendl;
  }
  // Reject the pool if it would push the cluster past
  // mon_max_pg_per_osd (-1 == "a new pool").
  r = check_pg_num(-1, pg_num, size, crush_rule, ss);
  if (r) {
    dout(10) << "check_pg_num returns " << r << dendl;
    return r;
  }

  // The rule's type (replicated vs erasure) must match the pool type.
  if (osdmap.crush->get_rule_type(crush_rule) != (int)pool_type) {
    *ss << "crush rule " << crush_rule << " type does not match pool";
    return -EINVAL;
  }

  uint32_t stripe_width = 0;
  r = prepare_pool_stripe_width(pool_type, erasure_code_profile, &stripe_width, ss);
  if (r) {
    dout(10) << "prepare_pool_stripe_width returns " << r << dendl;
    return r;
  }

  // Translate the tri-state fast_read request into a concrete bool
  // (only meaningful for EC pools).
  bool fread = false;
  if (pool_type == pg_pool_t::TYPE_ERASURE) {
    switch (fast_read) {
      case FAST_READ_OFF:
	fread = false;
	break;
      case FAST_READ_ON:
	fread = true;
	break;
      case FAST_READ_DEFAULT:
	fread = g_conf()->osd_pool_default_ec_fast_read;
	break;
      default:
	*ss << "invalid fast_read setting: " << fast_read;
	return -EINVAL;
    }
  }

  // If a pool of this name is already pending creation, treat the
  // request as an idempotent success.
  for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
       p != pending_inc.new_pool_names.end();
       ++p) {
    if (p->second == name)
      return 0;
  }

  // Allocate the next pool id and a fresh pg_pool_t in the pending
  // incremental.
  if (-1 == pending_inc.new_pool_max)
    pending_inc.new_pool_max = osdmap.pool_max;
  int64_t pool = ++pending_inc.new_pool_max;
  pg_pool_t empty;
  pg_pool_t *pi = pending_inc.get_new_pool(pool, &empty);
  pi->create_time = ceph_clock_now();
  pi->type = pool_type;
  pi->fast_read = fread;
  // Seed flags from config defaults, then apply the per-flag defaults.
  pi->flags = g_conf()->osd_pool_default_flags;
  if (bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  } else if (g_conf()->osd_pool_default_flag_bulk) {
    pi->set_flag(pg_pool_t::FLAG_BULK);
  }
  if (g_conf()->osd_pool_default_flag_hashpspool)
    pi->set_flag(pg_pool_t::FLAG_HASHPSPOOL);
  if (g_conf()->osd_pool_default_flag_nodelete)
    pi->set_flag(pg_pool_t::FLAG_NODELETE);
  if (g_conf()->osd_pool_default_flag_nopgchange)
    pi->set_flag(pg_pool_t::FLAG_NOPGCHANGE);
  if (g_conf()->osd_pool_default_flag_nosizechange)
    pi->set_flag(pg_pool_t::FLAG_NOSIZECHANGE);
  // CREATING is cleared once the initial PGs exist.
  pi->set_flag(pg_pool_t::FLAG_CREATING);
  if (g_conf()->osd_pool_use_gmt_hitset)
    pi->use_gmt_hitset = true;
  else
    pi->use_gmt_hitset = false;

  pi->size = size;
  pi->min_size = min_size;
  pi->crush_rule = crush_rule;
  pi->expected_num_objects = expected_num_objects;
  pi->object_hash = CEPH_STR_HASH_RJENKINS;
  // Inherit stretch-mode peering constraints from the osdmap, halving
  // size/min_size when the cluster is in degraded stretch mode.
  if (osdmap.stretch_mode_enabled) {
    pi->peering_crush_bucket_count = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_target = osdmap.stretch_bucket_count;
    pi->peering_crush_bucket_barrier = osdmap.stretch_mode_bucket;
    pi->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
    if (osdmap.degraded_stretch_mode) {
      pi->peering_crush_bucket_count = osdmap.degraded_stretch_mode;
      pi->peering_crush_bucket_target = osdmap.degraded_stretch_mode;
      // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
      // TODO: drat, we don't record this ^ anywhere, though given that it
      // necessarily won't exist elsewhere it likely doesn't matter
      pi->min_size = pi->min_size / 2;
      pi->size = pi->size / 2; // only support 2 zones now
    }
  }

  // Autoscale mode: start from the configured default (OFF if the
  // default is unparseable); the caller's explicit mode overrides it
  // further below.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
        g_conf().get_val<string>("osd_pool_default_pg_autoscale_mode"));
      m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  } else {
    pi->pg_autoscale_mode = pg_pool_t::pg_autoscale_mode_t::OFF;
  }
  // Actual initial pg_num is capped by mon_osd_max_initial_pgs; the
  // full request is recorded as the target for the mgr to grow into.
  auto max = g_conf().get_val<int64_t>("mon_osd_max_initial_pgs");
  pi->set_pg_num(
    max > 0 ? std::min<uint64_t>(pg_num, std::max<int64_t>(1, max))
    : pg_num);
  pi->set_pg_num_pending(pi->get_pg_num());
  pi->set_pg_num_target(pg_num);
  pi->set_pgp_num(pi->get_pg_num());
  pi->set_pgp_num_target(pgp_num);
  // PG_NUM_MIN/PG_NUM_MAX opts require sufficiently new OSDs.
  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      pg_num_min) {
    pi->opts.set(pool_opts_t::PG_NUM_MIN, static_cast<int64_t>(pg_num_min));
  }
  if (osdmap.require_osd_release >= ceph_release_t::quincy &&
      pg_num_max) {
    pi->opts.set(pool_opts_t::PG_NUM_MAX, static_cast<int64_t>(pg_num_max));
  }
  // Caller-supplied autoscale mode wins over the configured default.
  if (auto m = pg_pool_t::get_pg_autoscale_mode_by_name(
	pg_autoscale_mode); m != pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
    pi->pg_autoscale_mode = m;
  }

  pi->last_change = pending_inc.epoch;
  pi->auid = 0;

  if (pool_type == pg_pool_t::TYPE_ERASURE) {
      pi->erasure_code_profile = erasure_code_profile;
  } else {
      pi->erasure_code_profile = "";
  }
  pi->stripe_width = stripe_width;

  if (osdmap.require_osd_release >= ceph_release_t::nautilus &&
      target_size_bytes) {
    // only store for nautilus+ because TARGET_SIZE_BYTES may be
    // larger than int32_t max.
    pi->opts.set(pool_opts_t::TARGET_SIZE_BYTES, static_cast<int64_t>(target_size_bytes));
  }
  if (target_size_ratio > 0.0 &&
      osdmap.require_osd_release >= ceph_release_t::nautilus) {
    // only store for nautilus+, just to be consistent and tidy.
    pi->opts.set(pool_opts_t::TARGET_SIZE_RATIO, target_size_ratio);
  }

  // Cache-tier defaults (ratios stored in micro-units).
  pi->cache_target_dirty_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_ratio * 1000000;
  pi->cache_target_dirty_high_ratio_micro =
    g_conf()->osd_pool_default_cache_target_dirty_high_ratio * 1000000;
  pi->cache_target_full_ratio_micro =
    g_conf()->osd_pool_default_cache_target_full_ratio * 1000000;
  pi->cache_min_flush_age = g_conf()->osd_pool_default_cache_min_flush_age;
  pi->cache_min_evict_age = g_conf()->osd_pool_default_cache_min_evict_age;

  // Record the name; this makes the creation visible to the dup check
  // above for any retried request.
  pending_inc.new_pool_names[pool] = name;
  return 0;
}
8223
8224 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op, int flag)
8225 {
8226 op->mark_osdmon_event(__func__);
8227 ostringstream ss;
8228 if (pending_inc.new_flags < 0)
8229 pending_inc.new_flags = osdmap.get_flags();
8230 pending_inc.new_flags |= flag;
8231 ss << OSDMap::get_flag_string(flag) << " is set";
8232 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8233 get_last_committed() + 1));
8234 return true;
8235 }
8236
8237 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op, int flag)
8238 {
8239 op->mark_osdmon_event(__func__);
8240 ostringstream ss;
8241 if (pending_inc.new_flags < 0)
8242 pending_inc.new_flags = osdmap.get_flags();
8243 pending_inc.new_flags &= ~flag;
8244 ss << OSDMap::get_flag_string(flag) << " is unset";
8245 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
8246 get_last_committed() + 1));
8247 return true;
8248 }
8249
8250 int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap,
8251 stringstream& ss)
8252 {
8253 string poolstr;
8254 cmd_getval(cmdmap, "pool", poolstr);
8255 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
8256 if (pool < 0) {
8257 ss << "unrecognized pool '" << poolstr << "'";
8258 return -ENOENT;
8259 }
8260 string var;
8261 cmd_getval(cmdmap, "var", var);
8262
8263 pg_pool_t p = *osdmap.get_pg_pool(pool);
8264 if (pending_inc.new_pools.count(pool))
8265 p = pending_inc.new_pools[pool];
8266
8267 // accept val as a json string in the normal case (current
8268 // generation monitor). parse out int or float values from the
8269 // string as needed. however, if it is not a string, try to pull
8270 // out an int, in case an older monitor with an older json schema is
8271 // forwarding a request.
8272 string val;
8273 string interr, floaterr;
8274 int64_t n = 0;
8275 double f = 0;
8276 int64_t uf = 0; // micro-f
8277 cmd_getval(cmdmap, "val", val);
8278
8279 auto si_options = {
8280 "target_max_objects"
8281 };
8282 auto iec_options = {
8283 "target_max_bytes",
8284 "target_size_bytes",
8285 "compression_max_blob_size",
8286 "compression_min_blob_size",
8287 "csum_max_block",
8288 "csum_min_block",
8289 };
8290 if (count(begin(si_options), end(si_options), var)) {
8291 n = strict_si_cast<int64_t>(val, &interr);
8292 } else if (count(begin(iec_options), end(iec_options), var)) {
8293 n = strict_iec_cast<int64_t>(val, &interr);
8294 } else {
8295 // parse string as both int and float; different fields use different types.
8296 n = strict_strtoll(val.c_str(), 10, &interr);
8297 f = strict_strtod(val.c_str(), &floaterr);
8298 uf = llrintl(f * (double)1000000.0);
8299 }
8300
8301 if (!p.is_tier() &&
8302 (var == "hit_set_type" || var == "hit_set_period" ||
8303 var == "hit_set_count" || var == "hit_set_fpp" ||
8304 var == "target_max_objects" || var == "target_max_bytes" ||
8305 var == "cache_target_full_ratio" || var == "cache_target_dirty_ratio" ||
8306 var == "cache_target_dirty_high_ratio" || var == "use_gmt_hitset" ||
8307 var == "cache_min_flush_age" || var == "cache_min_evict_age" ||
8308 var == "hit_set_grade_decay_rate" || var == "hit_set_search_last_n" ||
8309 var == "min_read_recency_for_promote" || var == "min_write_recency_for_promote")) {
8310 return -EACCES;
8311 }
8312
8313 if (var == "size") {
8314 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8315 ss << "pool size change is disabled; you must unset nosizechange flag for the pool first";
8316 return -EPERM;
8317 }
8318 if (p.type == pg_pool_t::TYPE_ERASURE) {
8319 ss << "can not change the size of an erasure-coded pool";
8320 return -ENOTSUP;
8321 }
8322 if (interr.length()) {
8323 ss << "error parsing integer value '" << val << "': " << interr;
8324 return -EINVAL;
8325 }
8326 if (n <= 0 || n > 10) {
8327 ss << "pool size must be between 1 and 10";
8328 return -EINVAL;
8329 }
8330 if (n == 1) {
8331 if (!g_conf().get_val<bool>("mon_allow_pool_size_one")) {
8332 ss << "configuring pool size as 1 is disabled by default.";
8333 return -EPERM;
8334 }
8335 bool sure = false;
8336 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
8337 if (!sure) { ss << "WARNING: setting pool size 1 could lead to data loss "
8338 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8339 "pass the flag --yes-i-really-mean-it.";
8340 return -EPERM;
8341 }
8342 }
8343 if (osdmap.crush->get_rule_type(p.get_crush_rule()) != (int)p.type) {
8344 ss << "crush rule " << p.get_crush_rule() << " type does not match pool";
8345 return -EINVAL;
8346 }
8347 int r = check_pg_num(pool, p.get_pg_num(), n, p.get_crush_rule(), &ss);
8348 if (r < 0) {
8349 return r;
8350 }
8351 p.size = n;
8352 p.min_size = g_conf().get_osd_pool_default_min_size(p.size);
8353 } else if (var == "min_size") {
8354 if (p.has_flag(pg_pool_t::FLAG_NOSIZECHANGE)) {
8355 ss << "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8356 return -EPERM;
8357 }
8358 if (interr.length()) {
8359 ss << "error parsing integer value '" << val << "': " << interr;
8360 return -EINVAL;
8361 }
8362
8363 if (p.type != pg_pool_t::TYPE_ERASURE) {
8364 if (n < 1 || n > p.size) {
8365 ss << "pool min_size must be between 1 and size, which is set to " << (int)p.size;
8366 return -EINVAL;
8367 }
8368 } else {
8369 ErasureCodeInterfaceRef erasure_code;
8370 int k;
8371 stringstream tmp;
8372 int err = get_erasure_code(p.erasure_code_profile, &erasure_code, &tmp);
8373 if (err == 0) {
8374 k = erasure_code->get_data_chunk_count();
8375 } else {
8376 ss << __func__ << " get_erasure_code failed: " << tmp.str();
8377 return err;
8378 }
8379
8380 if (n < k || n > p.size) {
8381 ss << "pool min_size must be between " << k << " and size, which is set to " << (int)p.size;
8382 return -EINVAL;
8383 }
8384 }
8385 p.min_size = n;
8386 } else if (var == "pg_num_actual") {
8387 if (interr.length()) {
8388 ss << "error parsing integer value '" << val << "': " << interr;
8389 return -EINVAL;
8390 }
8391 if (n == (int)p.get_pg_num()) {
8392 return 0;
8393 }
8394 if (static_cast<uint64_t>(n) > g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8395 ss << "'pg_num' must be greater than 0 and less than or equal to "
8396 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8397 << " (you may adjust 'mon max pool pg num' for higher values)";
8398 return -ERANGE;
8399 }
8400 if (p.has_flag(pg_pool_t::FLAG_CREATING)) {
8401 ss << "cannot adjust pg_num while initial PGs are being created";
8402 return -EBUSY;
8403 }
8404 if (n > (int)p.get_pg_num()) {
8405 if (p.get_pg_num() != p.get_pg_num_pending()) {
8406 // force pre-nautilus clients to resend their ops, since they
8407 // don't understand pg_num_pending changes form a new interval
8408 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8409 }
8410 p.set_pg_num(n);
8411 } else {
8412 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8413 ss << "nautilus OSDs are required to adjust pg_num_pending";
8414 return -EPERM;
8415 }
8416 if (n < (int)p.get_pgp_num()) {
8417 ss << "specified pg_num " << n << " < pgp_num " << p.get_pgp_num();
8418 return -EINVAL;
8419 }
8420 if (n < (int)p.get_pg_num() - 1) {
8421 ss << "specified pg_num " << n << " < pg_num (" << p.get_pg_num()
8422 << ") - 1; only single pg decrease is currently supported";
8423 return -EINVAL;
8424 }
8425 p.set_pg_num_pending(n);
8426 // force pre-nautilus clients to resend their ops, since they
8427 // don't understand pg_num_pending changes form a new interval
8428 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8429 }
8430 // force pre-luminous clients to resend their ops, since they
8431 // don't understand that split PGs now form a new interval.
8432 p.last_force_op_resend_preluminous = pending_inc.epoch;
8433 } else if (var == "pg_num") {
8434 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8435 ss << "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8436 return -EPERM;
8437 }
8438 if (interr.length()) {
8439 ss << "error parsing integer value '" << val << "': " << interr;
8440 return -EINVAL;
8441 }
8442 if (n == (int)p.get_pg_num_target()) {
8443 return 0;
8444 }
8445 if (n <= 0 || static_cast<uint64_t>(n) >
8446 g_conf().get_val<uint64_t>("mon_max_pool_pg_num")) {
8447 ss << "'pg_num' must be greater than 0 and less than or equal to "
8448 << g_conf().get_val<uint64_t>("mon_max_pool_pg_num")
8449 << " (you may adjust 'mon max pool pg num' for higher values)";
8450 return -ERANGE;
8451 }
8452 if (n > (int)p.get_pg_num_target()) {
8453 int r = check_pg_num(pool, n, p.get_size(), p.get_crush_rule(), &ss);
8454 if (r) {
8455 return r;
8456 }
8457 bool force = false;
8458 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8459 if (p.cache_mode != pg_pool_t::CACHEMODE_NONE && !force) {
8460 ss << "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8461 return -EPERM;
8462 }
8463 } else {
8464 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8465 ss << "nautilus OSDs are required to decrease pg_num";
8466 return -EPERM;
8467 }
8468 }
8469 int64_t pg_min = 0, pg_max = 0;
8470 p.opts.get(pool_opts_t::PG_NUM_MIN, &pg_min);
8471 p.opts.get(pool_opts_t::PG_NUM_MAX, &pg_max);
8472 if (pg_min && n < pg_min) {
8473 ss << "specified pg_num " << n
8474 << " < pg_num_min " << pg_min;
8475 return -EINVAL;
8476 }
8477 if (pg_max && n > pg_max) {
8478 ss << "specified pg_num " << n
8479 << " < pg_num_max " << pg_max;
8480 return -EINVAL;
8481 }
8482 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8483 // pre-nautilus osdmap format; increase pg_num directly
8484 assert(n > (int)p.get_pg_num());
8485 // force pre-nautilus clients to resend their ops, since they
8486 // don't understand pg_num_target changes form a new interval
8487 p.last_force_op_resend_prenautilus = pending_inc.epoch;
8488 // force pre-luminous clients to resend their ops, since they
8489 // don't understand that split PGs now form a new interval.
8490 p.last_force_op_resend_preluminous = pending_inc.epoch;
8491 p.set_pg_num(n);
8492 } else {
8493 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8494 // make pgp_num track pg_num if it already matches. if it is set
8495 // differently, leave it different and let the user control it
8496 // manually.
8497 if (p.get_pg_num_target() == p.get_pgp_num_target()) {
8498 p.set_pgp_num_target(n);
8499 }
8500 p.set_pg_num_target(n);
8501 }
8502 } else if (var == "pgp_num_actual") {
8503 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8504 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8505 return -EPERM;
8506 }
8507 if (interr.length()) {
8508 ss << "error parsing integer value '" << val << "': " << interr;
8509 return -EINVAL;
8510 }
8511 if (n <= 0) {
8512 ss << "specified pgp_num must > 0, but you set to " << n;
8513 return -EINVAL;
8514 }
8515 if (n > (int)p.get_pg_num()) {
8516 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num();
8517 return -EINVAL;
8518 }
8519 if (n > (int)p.get_pg_num_pending()) {
8520 ss << "specified pgp_num " << n
8521 << " > pg_num_pending " << p.get_pg_num_pending();
8522 return -EINVAL;
8523 }
8524 p.set_pgp_num(n);
8525 } else if (var == "pgp_num") {
8526 if (p.has_flag(pg_pool_t::FLAG_NOPGCHANGE)) {
8527 ss << "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8528 return -EPERM;
8529 }
8530 if (interr.length()) {
8531 ss << "error parsing integer value '" << val << "': " << interr;
8532 return -EINVAL;
8533 }
8534 if (n <= 0) {
8535 ss << "specified pgp_num must > 0, but you set to " << n;
8536 return -EINVAL;
8537 }
8538 if (n > (int)p.get_pg_num_target()) {
8539 ss << "specified pgp_num " << n << " > pg_num " << p.get_pg_num_target();
8540 return -EINVAL;
8541 }
8542 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8543 // pre-nautilus osdmap format; increase pgp_num directly
8544 p.set_pgp_num(n);
8545 } else {
8546 p.set_pgp_num_target(n);
8547 }
8548 } else if (var == "pg_autoscale_mode") {
8549 auto m = pg_pool_t::get_pg_autoscale_mode_by_name(val);
8550 if (m == pg_pool_t::pg_autoscale_mode_t::UNKNOWN) {
8551 ss << "specified invalid mode " << val;
8552 return -EINVAL;
8553 }
8554 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8555 ss << "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8556 return -EINVAL;
8557 }
8558 p.pg_autoscale_mode = m;
8559 } else if (var == "crush_rule") {
8560 int id = osdmap.crush->get_rule_id(val);
8561 if (id == -ENOENT) {
8562 ss << "crush rule " << val << " does not exist";
8563 return -ENOENT;
8564 }
8565 if (id < 0) {
8566 ss << cpp_strerror(id);
8567 return -ENOENT;
8568 }
8569 if (osdmap.crush->get_rule_type(id) != (int)p.get_type()) {
8570 ss << "crush rule " << id << " type does not match pool";
8571 return -EINVAL;
8572 }
8573 p.crush_rule = id;
8574 } else if (var == "nodelete" || var == "nopgchange" ||
8575 var == "nosizechange" || var == "write_fadvise_dontneed" ||
8576 var == "noscrub" || var == "nodeep-scrub" || var == "bulk") {
8577 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8578 // make sure we only compare against 'n' if we didn't receive a string
8579 if (val == "true" || (interr.empty() && n == 1)) {
8580 p.set_flag(flag);
8581 } else if (val == "false" || (interr.empty() && n == 0)) {
8582 p.unset_flag(flag);
8583 } else {
8584 ss << "expecting value 'true', 'false', '0', or '1'";
8585 return -EINVAL;
8586 }
8587 } else if (var == "eio") {
8588 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8589
8590 // make sure we only compare against 'n' if we didn't receive a string
8591 if (val == "true" || (interr.empty() && n == 1)) {
8592 p.set_flag(flag);
8593 } else if (val == "false" || (interr.empty() && n == 0)) {
8594 p.unset_flag(flag);
8595 } else {
8596 ss << "expecting value 'true', 'false', '0', or '1'";
8597 return -EINVAL;
8598 }
8599 } else if (var == "hashpspool") {
8600 uint64_t flag = pg_pool_t::get_flag_by_name(var);
8601 bool force = false;
8602 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
8603
8604 if (!force) {
8605 ss << "are you SURE? this will remap all placement groups in this pool,"
8606 " this triggers large data movement,"
8607 " pass --yes-i-really-mean-it if you really do.";
8608 return -EPERM;
8609 }
8610 // make sure we only compare against 'n' if we didn't receive a string
8611 if (val == "true" || (interr.empty() && n == 1)) {
8612 p.set_flag(flag);
8613 } else if (val == "false" || (interr.empty() && n == 0)) {
8614 p.unset_flag(flag);
8615 } else {
8616 ss << "expecting value 'true', 'false', '0', or '1'";
8617 return -EINVAL;
8618 }
8619 } else if (var == "hit_set_type") {
8620 if (val == "none")
8621 p.hit_set_params = HitSet::Params();
8622 else {
8623 int err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
8624 if (err)
8625 return err;
8626 if (val == "bloom") {
8627 BloomHitSet::Params *bsp = new BloomHitSet::Params;
8628 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
8629 p.hit_set_params = HitSet::Params(bsp);
8630 } else if (val == "explicit_hash")
8631 p.hit_set_params = HitSet::Params(new ExplicitHashHitSet::Params);
8632 else if (val == "explicit_object")
8633 p.hit_set_params = HitSet::Params(new ExplicitObjectHitSet::Params);
8634 else {
8635 ss << "unrecognized hit_set type '" << val << "'";
8636 return -EINVAL;
8637 }
8638 }
8639 } else if (var == "hit_set_period") {
8640 if (interr.length()) {
8641 ss << "error parsing integer value '" << val << "': " << interr;
8642 return -EINVAL;
8643 } else if (n < 0) {
8644 ss << "hit_set_period should be non-negative";
8645 return -EINVAL;
8646 }
8647 p.hit_set_period = n;
8648 } else if (var == "hit_set_count") {
8649 if (interr.length()) {
8650 ss << "error parsing integer value '" << val << "': " << interr;
8651 return -EINVAL;
8652 } else if (n < 0) {
8653 ss << "hit_set_count should be non-negative";
8654 return -EINVAL;
8655 }
8656 p.hit_set_count = n;
8657 } else if (var == "hit_set_fpp") {
8658 if (floaterr.length()) {
8659 ss << "error parsing floating point value '" << val << "': " << floaterr;
8660 return -EINVAL;
8661 } else if (f < 0 || f > 1.0) {
8662 ss << "hit_set_fpp should be in the range 0..1";
8663 return -EINVAL;
8664 }
8665 if (p.hit_set_params.get_type() != HitSet::TYPE_BLOOM) {
8666 ss << "hit set is not of type Bloom; invalid to set a false positive rate!";
8667 return -EINVAL;
8668 }
8669 BloomHitSet::Params *bloomp = static_cast<BloomHitSet::Params*>(p.hit_set_params.impl.get());
8670 bloomp->set_fpp(f);
8671 } else if (var == "use_gmt_hitset") {
8672 if (val == "true" || (interr.empty() && n == 1)) {
8673 p.use_gmt_hitset = true;
8674 } else {
8675 ss << "expecting value 'true' or '1'";
8676 return -EINVAL;
8677 }
8678 } else if (var == "allow_ec_overwrites") {
8679 if (!p.is_erasure()) {
8680 ss << "ec overwrites can only be enabled for an erasure coded pool";
8681 return -EINVAL;
8682 }
8683 stringstream err;
8684 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites &&
8685 !is_pool_currently_all_bluestore(pool, p, &err)) {
8686 ss << "pool must only be stored on bluestore for scrubbing to work: " << err.str();
8687 return -EINVAL;
8688 }
8689 if (val == "true" || (interr.empty() && n == 1)) {
8690 p.flags |= pg_pool_t::FLAG_EC_OVERWRITES;
8691 } else if (val == "false" || (interr.empty() && n == 0)) {
8692 ss << "ec overwrites cannot be disabled once enabled";
8693 return -EINVAL;
8694 } else {
8695 ss << "expecting value 'true', 'false', '0', or '1'";
8696 return -EINVAL;
8697 }
8698 } else if (var == "target_max_objects") {
8699 if (interr.length()) {
8700 ss << "error parsing int '" << val << "': " << interr;
8701 return -EINVAL;
8702 }
8703 p.target_max_objects = n;
8704 } else if (var == "target_max_bytes") {
8705 if (interr.length()) {
8706 ss << "error parsing int '" << val << "': " << interr;
8707 return -EINVAL;
8708 }
8709 p.target_max_bytes = n;
8710 } else if (var == "cache_target_dirty_ratio") {
8711 if (floaterr.length()) {
8712 ss << "error parsing float '" << val << "': " << floaterr;
8713 return -EINVAL;
8714 }
8715 if (f < 0 || f > 1.0) {
8716 ss << "value must be in the range 0..1";
8717 return -ERANGE;
8718 }
8719 p.cache_target_dirty_ratio_micro = uf;
8720 } else if (var == "cache_target_dirty_high_ratio") {
8721 if (floaterr.length()) {
8722 ss << "error parsing float '" << val << "': " << floaterr;
8723 return -EINVAL;
8724 }
8725 if (f < 0 || f > 1.0) {
8726 ss << "value must be in the range 0..1";
8727 return -ERANGE;
8728 }
8729 p.cache_target_dirty_high_ratio_micro = uf;
8730 } else if (var == "cache_target_full_ratio") {
8731 if (floaterr.length()) {
8732 ss << "error parsing float '" << val << "': " << floaterr;
8733 return -EINVAL;
8734 }
8735 if (f < 0 || f > 1.0) {
8736 ss << "value must be in the range 0..1";
8737 return -ERANGE;
8738 }
8739 p.cache_target_full_ratio_micro = uf;
8740 } else if (var == "cache_min_flush_age") {
8741 if (interr.length()) {
8742 ss << "error parsing int '" << val << "': " << interr;
8743 return -EINVAL;
8744 }
8745 p.cache_min_flush_age = n;
8746 } else if (var == "cache_min_evict_age") {
8747 if (interr.length()) {
8748 ss << "error parsing int '" << val << "': " << interr;
8749 return -EINVAL;
8750 }
8751 p.cache_min_evict_age = n;
8752 } else if (var == "min_read_recency_for_promote") {
8753 if (interr.length()) {
8754 ss << "error parsing integer value '" << val << "': " << interr;
8755 return -EINVAL;
8756 }
8757 p.min_read_recency_for_promote = n;
8758 } else if (var == "hit_set_grade_decay_rate") {
8759 if (interr.length()) {
8760 ss << "error parsing integer value '" << val << "': " << interr;
8761 return -EINVAL;
8762 }
8763 if (n > 100 || n < 0) {
8764 ss << "value out of range,valid range is 0 - 100";
8765 return -EINVAL;
8766 }
8767 p.hit_set_grade_decay_rate = n;
8768 } else if (var == "hit_set_search_last_n") {
8769 if (interr.length()) {
8770 ss << "error parsing integer value '" << val << "': " << interr;
8771 return -EINVAL;
8772 }
8773 if (n > p.hit_set_count || n < 0) {
8774 ss << "value out of range,valid range is 0 - hit_set_count";
8775 return -EINVAL;
8776 }
8777 p.hit_set_search_last_n = n;
8778 } else if (var == "min_write_recency_for_promote") {
8779 if (interr.length()) {
8780 ss << "error parsing integer value '" << val << "': " << interr;
8781 return -EINVAL;
8782 }
8783 p.min_write_recency_for_promote = n;
8784 } else if (var == "fast_read") {
8785 if (p.is_replicated()) {
8786 ss << "fast read is not supported in replication pool";
8787 return -EINVAL;
8788 }
8789 if (val == "true" || (interr.empty() && n == 1)) {
8790 p.fast_read = true;
8791 } else if (val == "false" || (interr.empty() && n == 0)) {
8792 p.fast_read = false;
8793 } else {
8794 ss << "expecting value 'true', 'false', '0', or '1'";
8795 return -EINVAL;
8796 }
8797 } else if (pool_opts_t::is_opt_name(var)) {
8798 bool unset = val == "unset";
8799 if (var == "compression_mode") {
8800 if (!unset) {
8801 auto cmode = Compressor::get_comp_mode_type(val);
8802 if (!cmode) {
8803 ss << "unrecognized compression mode '" << val << "'";
8804 return -EINVAL;
8805 }
8806 }
8807 } else if (var == "compression_algorithm") {
8808 if (!unset) {
8809 auto alg = Compressor::get_comp_alg_type(val);
8810 if (!alg) {
8811 ss << "unrecognized compression_algorithm '" << val << "'";
8812 return -EINVAL;
8813 }
8814 }
8815 } else if (var == "compression_required_ratio") {
8816 if (floaterr.length()) {
8817 ss << "error parsing float value '" << val << "': " << floaterr;
8818 return -EINVAL;
8819 }
8820 if (f < 0 || f > 1) {
8821 ss << "compression_required_ratio is out of range (0-1): '" << val << "'";
8822 return -EINVAL;
8823 }
8824 } else if (var == "csum_type") {
8825 auto t = unset ? 0 : Checksummer::get_csum_string_type(val);
8826 if (t < 0 ) {
8827 ss << "unrecognized csum_type '" << val << "'";
8828 return -EINVAL;
8829 }
8830 //preserve csum_type numeric value
8831 n = t;
8832 interr.clear();
8833 } else if (var == "compression_max_blob_size" ||
8834 var == "compression_min_blob_size" ||
8835 var == "csum_max_block" ||
8836 var == "csum_min_block") {
8837 if (interr.length()) {
8838 ss << "error parsing int value '" << val << "': " << interr;
8839 return -EINVAL;
8840 }
8841 } else if (var == "fingerprint_algorithm") {
8842 if (!unset) {
8843 auto alg = pg_pool_t::get_fingerprint_from_str(val);
8844 if (!alg) {
8845 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8846 return -EINVAL;
8847 }
8848 }
8849 } else if (var == "target_size_bytes") {
8850 if (interr.length()) {
8851 ss << "error parsing unit value '" << val << "': " << interr;
8852 return -EINVAL;
8853 }
8854 if (osdmap.require_osd_release < ceph_release_t::nautilus) {
8855 ss << "must set require_osd_release to nautilus or "
8856 << "later before setting target_size_bytes";
8857 return -EINVAL;
8858 }
8859 } else if (var == "target_size_ratio") {
8860 if (f < 0.0) {
8861 ss << "target_size_ratio cannot be negative";
8862 return -EINVAL;
8863 }
8864 } else if (var == "pg_num_min") {
8865 if (interr.length()) {
8866 ss << "error parsing int value '" << val << "': " << interr;
8867 return -EINVAL;
8868 }
8869 if (n > (int)p.get_pg_num_target()) {
8870 ss << "specified pg_num_min " << n
8871 << " > pg_num " << p.get_pg_num_target();
8872 return -EINVAL;
8873 }
8874 } else if (var == "pg_num_max") {
8875 if (interr.length()) {
8876 ss << "error parsing int value '" << val << "': " << interr;
8877 return -EINVAL;
8878 }
8879 if (n && n < (int)p.get_pg_num_target()) {
8880 ss << "specified pg_num_max " << n
8881 << " < pg_num " << p.get_pg_num_target();
8882 return -EINVAL;
8883 }
8884 } else if (var == "recovery_priority") {
8885 if (interr.length()) {
8886 ss << "error parsing int value '" << val << "': " << interr;
8887 return -EINVAL;
8888 }
8889 if (!g_conf()->debug_allow_any_pool_priority) {
8890 if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) {
8891 ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8892 << " and " << OSD_POOL_PRIORITY_MAX;
8893 return -EINVAL;
8894 }
8895 }
8896 } else if (var == "pg_autoscale_bias") {
8897 if (f < 0.0 || f > 1000.0) {
8898 ss << "pg_autoscale_bias must be between 0 and 1000";
8899 return -EINVAL;
8900 }
8901 } else if (var == "dedup_tier") {
8902 if (interr.empty()) {
8903 ss << "expecting value 'pool name'";
8904 return -EINVAL;
8905 }
8906 // Current base tier in dedup does not support ec pool
8907 if (p.is_erasure()) {
8908 ss << "pool '" << poolstr
8909 << "' is an ec pool, which cannot be a base tier";
8910 return -ENOTSUP;
8911 }
8912 int64_t lowtierpool_id = osdmap.lookup_pg_pool_name(val);
8913 if (lowtierpool_id < 0) {
8914 ss << "unrecognized pool '" << val << "'";
8915 return -ENOENT;
8916 }
8917 const pg_pool_t *tp = osdmap.get_pg_pool(lowtierpool_id);
8918 ceph_assert(tp);
8919 n = lowtierpool_id;
8920 // The original input is string (pool name), but we convert it to int64_t.
8921 // So, clear interr
8922 interr.clear();
8923 } else if (var == "dedup_chunk_algorithm") {
8924 if (!unset) {
8925 auto alg = pg_pool_t::get_dedup_chunk_algorithm_from_str(val);
8926 if (!alg) {
8927 ss << "unrecognized fingerprint_algorithm '" << val << "'";
8928 return -EINVAL;
8929 }
8930 }
8931 } else if (var == "dedup_cdc_chunk_size") {
8932 if (interr.length()) {
8933 ss << "error parsing int value '" << val << "': " << interr;
8934 return -EINVAL;
8935 }
8936 }
8937
8938 pool_opts_t::opt_desc_t desc = pool_opts_t::get_opt_desc(var);
8939 switch (desc.type) {
8940 case pool_opts_t::STR:
8941 if (unset) {
8942 p.opts.unset(desc.key);
8943 } else {
8944 p.opts.set(desc.key, static_cast<std::string>(val));
8945 }
8946 break;
8947 case pool_opts_t::INT:
8948 if (interr.length()) {
8949 ss << "error parsing integer value '" << val << "': " << interr;
8950 return -EINVAL;
8951 }
8952 if (n == 0) {
8953 p.opts.unset(desc.key);
8954 } else {
8955 p.opts.set(desc.key, static_cast<int64_t>(n));
8956 }
8957 break;
8958 case pool_opts_t::DOUBLE:
8959 if (floaterr.length()) {
8960 ss << "error parsing floating point value '" << val << "': " << floaterr;
8961 return -EINVAL;
8962 }
8963 if (f == 0) {
8964 p.opts.unset(desc.key);
8965 } else {
8966 p.opts.set(desc.key, static_cast<double>(f));
8967 }
8968 break;
8969 default:
8970 ceph_assert(!"unknown type");
8971 }
8972 } else {
8973 ss << "unrecognized variable '" << var << "'";
8974 return -EINVAL;
8975 }
8976 if (val != "unset") {
8977 ss << "set pool " << pool << " " << var << " to " << val;
8978 } else {
8979 ss << "unset pool " << pool << " " << var;
8980 }
8981 p.last_change = pending_inc.epoch;
8982 pending_inc.new_pools[pool] = p;
8983 return 0;
8984 }
8985
8986 int OSDMonitor::prepare_command_pool_application(const string &prefix,
8987 const cmdmap_t& cmdmap,
8988 stringstream& ss)
8989 {
8990 return _command_pool_application(prefix, cmdmap, ss, nullptr, true);
8991 }
8992
8993 int OSDMonitor::preprocess_command_pool_application(const string &prefix,
8994 const cmdmap_t& cmdmap,
8995 stringstream& ss,
8996 bool *modified)
8997 {
8998 return _command_pool_application(prefix, cmdmap, ss, modified, false);
8999 }
9000
9001
9002 /**
9003 * Common logic for preprocess and prepare phases of pool application
9004 * tag commands. In preprocess mode we're only detecting invalid
9005 * commands, and determining whether it was a modification or a no-op.
9006 * In prepare mode we're actually updating the pending state.
9007 */
9008 int OSDMonitor::_command_pool_application(const string &prefix,
9009 const cmdmap_t& cmdmap,
9010 stringstream& ss,
9011 bool *modified,
9012 bool preparing)
9013 {
9014 string pool_name;
9015 cmd_getval(cmdmap, "pool", pool_name);
9016 int64_t pool = osdmap.lookup_pg_pool_name(pool_name.c_str());
9017 if (pool < 0) {
9018 ss << "unrecognized pool '" << pool_name << "'";
9019 return -ENOENT;
9020 }
9021
9022 pg_pool_t p = *osdmap.get_pg_pool(pool);
9023 if (preparing) {
9024 if (pending_inc.new_pools.count(pool)) {
9025 p = pending_inc.new_pools[pool];
9026 }
9027 }
9028
9029 string app;
9030 cmd_getval(cmdmap, "app", app);
9031 bool app_exists = (p.application_metadata.count(app) > 0);
9032
9033 string key;
9034 cmd_getval(cmdmap, "key", key);
9035 if (key == "all") {
9036 ss << "key cannot be 'all'";
9037 return -EINVAL;
9038 }
9039
9040 string value;
9041 cmd_getval(cmdmap, "value", value);
9042 if (value == "all") {
9043 ss << "value cannot be 'all'";
9044 return -EINVAL;
9045 }
9046
9047 if (boost::algorithm::ends_with(prefix, "enable")) {
9048 if (app.empty()) {
9049 ss << "application name must be provided";
9050 return -EINVAL;
9051 }
9052
9053 if (p.is_tier()) {
9054 ss << "application must be enabled on base tier";
9055 return -EINVAL;
9056 }
9057
9058 bool force = false;
9059 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9060
9061 if (!app_exists && !p.application_metadata.empty() && !force) {
9062 ss << "Are you SURE? Pool '" << pool_name << "' already has an enabled "
9063 << "application; pass --yes-i-really-mean-it to proceed anyway";
9064 return -EPERM;
9065 }
9066
9067 if (!app_exists && p.application_metadata.size() >= MAX_POOL_APPLICATIONS) {
9068 ss << "too many enabled applications on pool '" << pool_name << "'; "
9069 << "max " << MAX_POOL_APPLICATIONS;
9070 return -EINVAL;
9071 }
9072
9073 if (app.length() > MAX_POOL_APPLICATION_LENGTH) {
9074 ss << "application name '" << app << "' too long; max length "
9075 << MAX_POOL_APPLICATION_LENGTH;
9076 return -EINVAL;
9077 }
9078
9079 if (!app_exists) {
9080 p.application_metadata[app] = {};
9081 }
9082 ss << "enabled application '" << app << "' on pool '" << pool_name << "'";
9083
9084 } else if (boost::algorithm::ends_with(prefix, "disable")) {
9085 bool force = false;
9086 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
9087
9088 if (!force) {
9089 ss << "Are you SURE? Disabling an application within a pool might result "
9090 << "in loss of application functionality; pass "
9091 << "--yes-i-really-mean-it to proceed anyway";
9092 return -EPERM;
9093 }
9094
9095 if (!app_exists) {
9096 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9097 << "'";
9098 return 0; // idempotent
9099 }
9100
9101 p.application_metadata.erase(app);
9102 ss << "disable application '" << app << "' on pool '" << pool_name << "'";
9103
9104 } else if (boost::algorithm::ends_with(prefix, "set")) {
9105 if (p.is_tier()) {
9106 ss << "application metadata must be set on base tier";
9107 return -EINVAL;
9108 }
9109
9110 if (!app_exists) {
9111 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9112 << "'";
9113 return -ENOENT;
9114 }
9115
9116 string key;
9117 cmd_getval(cmdmap, "key", key);
9118
9119 if (key.empty()) {
9120 ss << "key must be provided";
9121 return -EINVAL;
9122 }
9123
9124 auto &app_keys = p.application_metadata[app];
9125 if (app_keys.count(key) == 0 &&
9126 app_keys.size() >= MAX_POOL_APPLICATION_KEYS) {
9127 ss << "too many keys set for application '" << app << "' on pool '"
9128 << pool_name << "'; max " << MAX_POOL_APPLICATION_KEYS;
9129 return -EINVAL;
9130 }
9131
9132 if (key.length() > MAX_POOL_APPLICATION_LENGTH) {
9133 ss << "key '" << app << "' too long; max length "
9134 << MAX_POOL_APPLICATION_LENGTH;
9135 return -EINVAL;
9136 }
9137
9138 string value;
9139 cmd_getval(cmdmap, "value", value);
9140 if (value.length() > MAX_POOL_APPLICATION_LENGTH) {
9141 ss << "value '" << value << "' too long; max length "
9142 << MAX_POOL_APPLICATION_LENGTH;
9143 return -EINVAL;
9144 }
9145
9146 p.application_metadata[app][key] = value;
9147 ss << "set application '" << app << "' key '" << key << "' to '"
9148 << value << "' on pool '" << pool_name << "'";
9149 } else if (boost::algorithm::ends_with(prefix, "rm")) {
9150 if (!app_exists) {
9151 ss << "application '" << app << "' is not enabled on pool '" << pool_name
9152 << "'";
9153 return -ENOENT;
9154 }
9155
9156 string key;
9157 cmd_getval(cmdmap, "key", key);
9158 auto it = p.application_metadata[app].find(key);
9159 if (it == p.application_metadata[app].end()) {
9160 ss << "application '" << app << "' on pool '" << pool_name
9161 << "' does not have key '" << key << "'";
9162 return 0; // idempotent
9163 }
9164
9165 p.application_metadata[app].erase(it);
9166 ss << "removed application '" << app << "' key '" << key << "' on pool '"
9167 << pool_name << "'";
9168 } else {
9169 ceph_abort();
9170 }
9171
9172 if (preparing) {
9173 p.last_change = pending_inc.epoch;
9174 pending_inc.new_pools[pool] = p;
9175 }
9176
9177 // Because we fell through this far, we didn't hit no-op cases,
9178 // so pool was definitely modified
9179 if (modified != nullptr) {
9180 *modified = true;
9181 }
9182
9183 return 0;
9184 }
9185
9186 int OSDMonitor::_prepare_command_osd_crush_remove(
9187 CrushWrapper &newcrush,
9188 int32_t id,
9189 int32_t ancestor,
9190 bool has_ancestor,
9191 bool unlink_only)
9192 {
9193 int err = 0;
9194
9195 if (has_ancestor) {
9196 err = newcrush.remove_item_under(cct, id, ancestor,
9197 unlink_only);
9198 } else {
9199 err = newcrush.remove_item(cct, id, unlink_only);
9200 }
9201 return err;
9202 }
9203
// Stage `newcrush` as the pending crush map: replace any previously staged
// encoding and re-encode with the feature bits the current quorum supports.
void OSDMonitor::do_osd_crush_remove(CrushWrapper& newcrush)
{
  pending_inc.crush.clear();
  newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
}
9209
9210 int OSDMonitor::prepare_command_osd_crush_remove(
9211 CrushWrapper &newcrush,
9212 int32_t id,
9213 int32_t ancestor,
9214 bool has_ancestor,
9215 bool unlink_only)
9216 {
9217 int err = _prepare_command_osd_crush_remove(
9218 newcrush, id, ancestor,
9219 has_ancestor, unlink_only);
9220
9221 if (err < 0)
9222 return err;
9223
9224 ceph_assert(err == 0);
9225 do_osd_crush_remove(newcrush);
9226
9227 return 0;
9228 }
9229
9230 int OSDMonitor::prepare_command_osd_remove(int32_t id)
9231 {
9232 if (osdmap.is_up(id)) {
9233 return -EBUSY;
9234 }
9235
9236 pending_inc.new_state[id] = osdmap.get_state(id);
9237 pending_inc.new_uuid[id] = uuid_d();
9238 pending_metadata_rm.insert(id);
9239 pending_metadata.erase(id);
9240
9241 return 0;
9242 }
9243
9244 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id)
9245 {
9246 ceph_assert(existing_id);
9247 *existing_id = -1;
9248
9249 for (int32_t i = 0; i < osdmap.get_max_osd(); ++i) {
9250 if (!osdmap.exists(i) &&
9251 pending_inc.new_up_client.count(i) == 0 &&
9252 (pending_inc.new_state.count(i) == 0 ||
9253 (pending_inc.new_state[i] & CEPH_OSD_EXISTS) == 0)) {
9254 *existing_id = i;
9255 return -1;
9256 }
9257 }
9258
9259 if (pending_inc.new_max_osd < 0) {
9260 return osdmap.get_max_osd();
9261 }
9262 return pending_inc.new_max_osd;
9263 }
9264
/*
 * Record the creation of an osd in the pending incremental.
 *
 * Precondition: validation (validate_osd_create() or equivalent) has
 * already run; invariant violations here assert rather than return errors.
 *
 * @param id            requested id, or -1 to pick one
 * @param uuid          osd uuid; may be zero for legacy `osd create`
 * @param device_class  if non-empty, assign this crush device class
 * @param new_id        out: the id actually created or matched
 */
void OSDMonitor::do_osd_create(
    const int32_t id,
    const uuid_d& uuid,
    const string& device_class,
    int32_t* new_id)
{
  dout(10) << __func__ << " uuid " << uuid << dendl;
  ceph_assert(new_id);

  // We presume validation has been performed prior to calling this
  // function. We assert with prejudice.

  int32_t allocated_id = -1; // declare here so we can jump
  int32_t existing_id = -1;
  if (!uuid.is_zero()) {
    existing_id = osdmap.identify_osd(uuid);
    if (existing_id >= 0) {
      // replay: this uuid already maps to an osd; reuse its id
      ceph_assert(id < 0 || id == existing_id);
      *new_id = existing_id;
      goto out;
    } else if (id >= 0) {
      // uuid does not exist, and id has been provided, so just create
      // the new osd.id
      *new_id = id;
      goto out;
    }
  }

  // allocate a new id
  allocated_id = _allocate_osd_id(&existing_id);
  dout(10) << __func__ << " allocated id " << allocated_id
           << " existing id " << existing_id << dendl;
  if (existing_id >= 0) {
    // reuse a free slot below the current max_osd
    ceph_assert(existing_id < osdmap.get_max_osd());
    ceph_assert(allocated_id < 0);
    *new_id = existing_id;
  } else if (allocated_id >= 0) {
    ceph_assert(existing_id < 0);
    // raise max_osd
    if (pending_inc.new_max_osd < 0) {
      pending_inc.new_max_osd = osdmap.get_max_osd() + 1;
    } else {
      ++pending_inc.new_max_osd;
    }
    *new_id = pending_inc.new_max_osd - 1;
    ceph_assert(*new_id == allocated_id);
  } else {
    ceph_abort_msg("unexpected condition");
  }

 out:
  if (device_class.size()) {
    // make sure the crush map has a device slot and a name for the new osd
    // before assigning the requested device class
    CrushWrapper newcrush = _get_pending_crush();
    if (newcrush.get_max_devices() < *new_id + 1) {
      newcrush.set_max_devices(*new_id + 1);
    }
    string name = string("osd.") + stringify(*new_id);
    if (!newcrush.item_exists(*new_id)) {
      newcrush.set_item_name(*new_id, name);
    }
    ostringstream ss;
    int r = newcrush.update_device_class(*new_id, device_class, name, &ss);
    if (r < 0) {
      derr << __func__ << " failed to set " << name << " device_class "
           << device_class << ": " << cpp_strerror(r) << " - " << ss.str()
           << dendl;
      // non-fatal... this might be a replay and we want to be idempotent.
    } else {
      dout(20) << __func__ << " set " << name << " device_class " << device_class
               << dendl;
      // stage the modified crush map
      pending_inc.crush.clear();
      newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
    }
  } else {
    dout(20) << __func__ << " no device_class" << dendl;
  }

  dout(10) << __func__ << " using id " << *new_id << dendl;
  // grow max_osd if neither the committed map nor the pending value covers
  // the chosen id
  if (osdmap.get_max_osd() <= *new_id && pending_inc.new_max_osd <= *new_id) {
    pending_inc.new_max_osd = *new_id + 1;
  }

  pending_inc.new_weight[*new_id] = CEPH_OSD_IN;
  // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
  // set it for us. (ugh.)
  pending_inc.new_state[*new_id] |= CEPH_OSD_NEW;
  if (!uuid.is_zero())
    pending_inc.new_uuid[*new_id] = uuid;
}
9354
9355 int OSDMonitor::validate_osd_create(
9356 const int32_t id,
9357 const uuid_d& uuid,
9358 const bool check_osd_exists,
9359 int32_t* existing_id,
9360 stringstream& ss)
9361 {
9362
9363 dout(10) << __func__ << " id " << id << " uuid " << uuid
9364 << " check_osd_exists " << check_osd_exists << dendl;
9365
9366 ceph_assert(existing_id);
9367
9368 if (id < 0 && uuid.is_zero()) {
9369 // we have nothing to validate
9370 *existing_id = -1;
9371 return 0;
9372 } else if (uuid.is_zero()) {
9373 // we have an id but we will ignore it - because that's what
9374 // `osd create` does.
9375 return 0;
9376 }
9377
9378 /*
9379 * This function will be used to validate whether we are able to
9380 * create a new osd when the `uuid` is specified.
9381 *
9382 * It will be used by both `osd create` and `osd new`, as the checks
9383 * are basically the same when it pertains to osd id and uuid validation.
9384 * However, `osd create` presumes an `uuid` is optional, for legacy
9385 * reasons, while `osd new` requires the `uuid` to be provided. This
9386 * means that `osd create` will not be idempotent if an `uuid` is not
9387 * provided, but we will always guarantee the idempotency of `osd new`.
9388 */
9389
9390 ceph_assert(!uuid.is_zero());
9391 if (pending_inc.identify_osd(uuid) >= 0) {
9392 // osd is about to exist
9393 return -EAGAIN;
9394 }
9395
9396 int32_t i = osdmap.identify_osd(uuid);
9397 if (i >= 0) {
9398 // osd already exists
9399 if (id >= 0 && i != id) {
9400 ss << "uuid " << uuid << " already in use for different id " << i;
9401 return -EEXIST;
9402 }
9403 // return a positive errno to distinguish between a blocking error
9404 // and an error we consider to not be a problem (i.e., this would be
9405 // an idempotent operation).
9406 *existing_id = i;
9407 return EEXIST;
9408 }
9409 // i < 0
9410 if (id >= 0) {
9411 if (pending_inc.new_state.count(id)) {
9412 // osd is about to exist
9413 return -EAGAIN;
9414 }
9415 // we may not care if an osd exists if we are recreating a previously
9416 // destroyed osd.
9417 if (check_osd_exists && osdmap.exists(id)) {
9418 ss << "id " << id << " already in use and does not match uuid "
9419 << uuid;
9420 return -EINVAL;
9421 }
9422 }
9423 return 0;
9424 }
9425
9426 int OSDMonitor::prepare_command_osd_create(
9427 const int32_t id,
9428 const uuid_d& uuid,
9429 int32_t* existing_id,
9430 stringstream& ss)
9431 {
9432 dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;
9433 ceph_assert(existing_id);
9434 if (osdmap.is_destroyed(id)) {
9435 ss << "ceph osd create has been deprecated. Please use ceph osd new "
9436 "instead.";
9437 return -EINVAL;
9438 }
9439
9440 if (uuid.is_zero()) {
9441 dout(10) << __func__ << " no uuid; assuming legacy `osd create`" << dendl;
9442 }
9443
9444 return validate_osd_create(id, uuid, true, existing_id, ss);
9445 }
9446
/*
 * Handle `osd new`: create a brand-new osd, or recreate a previously
 * destroyed one, optionally registering cephx/lockbox secrets and a
 * dm-crypt key with the auth and config-key services.
 *
 * @param op      the monitor op (paxos must already be plugged by caller)
 * @param cmdmap  parsed command args (`uuid` required, `id` optional)
 * @param params  secrets/extras parsed from the command input buffer
 * @param ss      textual result (osd id, or error detail)
 * @param f       optional formatter for structured output
 * @return 0 on success, positive EEXIST for an idempotent no-op replay,
 *         negative errno on error
 */
int OSDMonitor::prepare_command_osd_new(
    MonOpRequestRef op,
    const cmdmap_t& cmdmap,
    const map<string,string>& params,
    stringstream &ss,
    Formatter *f)
{
  uuid_d uuid;
  string uuidstr;
  int64_t id = -1;

  ceph_assert(paxos.is_plugged());

  dout(10) << __func__ << " " << op << dendl;

  /* validate command. abort now if something's wrong. */

  /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
   *
   * If `id` is not specified, we will identify any existing osd based
   * on `uuid`. Operation will be idempotent iff secrets match.
   *
   * If `id` is specified, we will identify any existing osd based on
   * `uuid` and match against `id`. If they match, operation will be
   * idempotent iff secrets match.
   *
   * `-i secrets.json` will be optional. If supplied, will be used
   * to check for idempotency when `id` and `uuid` match.
   *
   * If `id` is not specified, and `uuid` does not exist, an id will
   * be found or allocated for the osd.
   *
   * If `id` is specified, and the osd has been previously marked
   * as destroyed, then the `id` will be reused.
   */
  if (!cmd_getval(cmdmap, "uuid", uuidstr)) {
    ss << "requires the OSD's UUID to be specified.";
    return -EINVAL;
  } else if (!uuid.parse(uuidstr.c_str())) {
    ss << "invalid UUID value '" << uuidstr << "'.";
    return -EINVAL;
  }

  if (cmd_getval(cmdmap, "id", id) &&
      (id < 0)) {
    ss << "invalid OSD id; must be greater or equal than zero.";
    return -EINVAL;
  }

  // are we running an `osd create`-like command, or recreating
  // a previously destroyed osd?

  bool is_recreate_destroyed = (id >= 0 && osdmap.is_destroyed(id));

  // we will care about `id` to assess whether osd is `destroyed`, or
  // to create a new osd.
  // we will need an `id` by the time we reach auth.

  int32_t existing_id = -1;
  // skip the "id already exists" check when recreating a destroyed osd
  int err = validate_osd_create(id, uuid, !is_recreate_destroyed,
                                &existing_id, ss);

  bool may_be_idempotent = false;
  if (err == EEXIST) {
    // this is idempotent from the osdmon's point-of-view
    may_be_idempotent = true;
    ceph_assert(existing_id >= 0);
    id = existing_id;
  } else if (err < 0) {
    return err;
  }

  if (!may_be_idempotent) {
    // idempotency is out of the window. We are either creating a new
    // osd or recreating a destroyed osd.
    //
    // We now need to figure out if we have an `id` (and if it's valid),
    // of find an `id` if we don't have one.

    // NOTE: we need to consider the case where the `id` is specified for
    // `osd create`, and we must honor it. So this means checking if
    // the `id` is destroyed, and if so assume the destroy; otherwise,
    // check if it `exists` - in which case we complain about not being
    // `destroyed`. In the end, if nothing fails, we must allow the
    // creation, so that we are compatible with `create`.
    if (id >= 0 && osdmap.exists(id) && !osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " osd." << id << " isn't destroyed" << dendl;
      ss << "OSD " << id << " has not yet been destroyed";
      return -EINVAL;
    } else if (id < 0) {
      // find an `id`
      id = _allocate_osd_id(&existing_id);
      if (id < 0) {
        // -1 means a free existing slot was found instead
        ceph_assert(existing_id >= 0);
        id = existing_id;
      }
      dout(10) << __func__ << " found id " << id << " to use" << dendl;
    } else if (id >= 0 && osdmap.is_destroyed(id)) {
      dout(10) << __func__ << " recreating osd." << id << dendl;
    } else {
      dout(10) << __func__ << " creating new osd." << id << dendl;
    }
  } else {
    ceph_assert(id >= 0);
    ceph_assert(osdmap.exists(id));
  }

  // we are now able to either create a brand new osd or reuse an existing
  // osd that has been previously destroyed.

  dout(10) << __func__ << " id " << id << " uuid " << uuid << dendl;

  if (may_be_idempotent && params.empty()) {
    // nothing to do, really.
    dout(10) << __func__ << " idempotent and no params -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }

  string device_class;
  auto p = params.find("crush_device_class");
  if (p != params.end()) {
    device_class = p->second;
    dout(20) << __func__ << " device_class will be " << device_class << dendl;
  }
  string cephx_secret, lockbox_secret, dmcrypt_key;
  bool has_lockbox = false;
  bool has_secrets = params.count("cephx_secret")
    || params.count("cephx_lockbox_secret")
    || params.count("dmcrypt_key");

  KVMonitor *svc = nullptr;
  AuthMonitor::auth_entity_t cephx_entity, lockbox_entity;

  if (has_secrets) {
    // cephx_secret is mandatory; lockbox secret and dm-crypt key must be
    // supplied together or not at all
    if (params.count("cephx_secret") == 0) {
      ss << "requires a cephx secret.";
      return -EINVAL;
    }
    cephx_secret = params.at("cephx_secret");

    bool has_lockbox_secret = (params.count("cephx_lockbox_secret") > 0);
    bool has_dmcrypt_key = (params.count("dmcrypt_key") > 0);

    dout(10) << __func__ << " has lockbox " << has_lockbox_secret
             << " dmcrypt " << has_dmcrypt_key << dendl;

    if (has_lockbox_secret && has_dmcrypt_key) {
      has_lockbox = true;
      lockbox_secret = params.at("cephx_lockbox_secret");
      dmcrypt_key = params.at("dmcrypt_key");
    } else if (!has_lockbox_secret != !has_dmcrypt_key) {
      ss << "requires both a cephx lockbox secret and a dm-crypt key.";
      return -EINVAL;
    }

    dout(10) << __func__ << " validate secrets using osd id " << id << dendl;

    err = mon.authmon()->validate_osd_new(id, uuid,
                                          cephx_secret,
                                          lockbox_secret,
                                          cephx_entity,
                                          lockbox_entity,
                                          ss);
    if (err < 0) {
      return err;
    } else if (may_be_idempotent && err != EEXIST) {
      // for this to be idempotent, `id` should already be >= 0; no need
      // to use validate_id.
      ceph_assert(id >= 0);
      ss << "osd." << id << " exists but secrets do not match";
      return -EEXIST;
    }

    if (has_lockbox) {
      svc = mon.kvmon();
      err = svc->validate_osd_new(uuid, dmcrypt_key, ss);
      if (err < 0) {
        return err;
      } else if (may_be_idempotent && err != EEXIST) {
        ceph_assert(id >= 0);
        ss << "osd." << id << " exists but dm-crypt key does not match.";
        return -EEXIST;
      }
    }
  }
  ceph_assert(!has_secrets || !cephx_secret.empty());
  ceph_assert(!has_lockbox || !lockbox_secret.empty());

  if (may_be_idempotent) {
    // we have nothing to do for either the osdmon or the authmon,
    // and we have no lockbox - so the config key service will not be
    // touched. This is therefore an idempotent operation, and we can
    // just return right away.
    dout(10) << __func__ << " idempotent -- no op." << dendl;
    ceph_assert(id >= 0);
    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", id);
      f->close_section();
    } else {
      ss << id;
    }
    return EEXIST;
  }
  ceph_assert(!may_be_idempotent);

  // perform updates.
  if (has_secrets) {
    ceph_assert(!cephx_secret.empty());
    ceph_assert((lockbox_secret.empty() && dmcrypt_key.empty()) ||
                (!lockbox_secret.empty() && !dmcrypt_key.empty()));

    err = mon.authmon()->do_osd_new(cephx_entity,
                                    lockbox_entity,
                                    has_lockbox);
    ceph_assert(0 == err);

    if (has_lockbox) {
      ceph_assert(nullptr != svc);
      svc->do_osd_new(uuid, dmcrypt_key);
    }
  }

  if (is_recreate_destroyed) {
    // flip the DESTROYED bit back off (new_state entries toggle bits) and
    // restore the uuid; the osd slot itself is reused as-is
    ceph_assert(id >= 0);
    ceph_assert(osdmap.is_destroyed(id));
    pending_inc.new_state[id] |= CEPH_OSD_DESTROYED;
    if ((osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      pending_inc.new_state[id] |= CEPH_OSD_NEW;
    }
    if (osdmap.get_state(id) & CEPH_OSD_UP) {
      // due to http://tracker.ceph.com/issues/20751 some clusters may
      // have UP set for non-existent OSDs; make sure it is cleared
      // for a newly created osd.
      pending_inc.new_state[id] |= CEPH_OSD_UP;
    }
    pending_inc.new_uuid[id] = uuid;
  } else {
    ceph_assert(id >= 0);
    int32_t new_id = -1;
    do_osd_create(id, uuid, device_class, &new_id);
    ceph_assert(new_id >= 0);
    ceph_assert(id == new_id);
  }

  if (f) {
    f->open_object_section("created_osd");
    f->dump_int("osdid", id);
    f->close_section();
  } else {
    ss << id;
  }

  return 0;
}
9710
9711 bool OSDMonitor::prepare_command(MonOpRequestRef op)
9712 {
9713 op->mark_osdmon_event(__func__);
9714 auto m = op->get_req<MMonCommand>();
9715 stringstream ss;
9716 cmdmap_t cmdmap;
9717 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
9718 string rs = ss.str();
9719 mon.reply_command(op, -EINVAL, rs, get_last_committed());
9720 return true;
9721 }
9722
9723 MonSession *session = op->get_session();
9724 if (!session) {
9725 derr << __func__ << " no session" << dendl;
9726 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
9727 return true;
9728 }
9729
9730 return prepare_command_impl(op, cmdmap);
9731 }
9732
9733 static int parse_reweights(CephContext *cct,
9734 const cmdmap_t& cmdmap,
9735 const OSDMap& osdmap,
9736 map<int32_t, uint32_t>* weights)
9737 {
9738 string weights_str;
9739 if (!cmd_getval(cmdmap, "weights", weights_str)) {
9740 return -EINVAL;
9741 }
9742 std::replace(begin(weights_str), end(weights_str), '\'', '"');
9743 json_spirit::mValue json_value;
9744 if (!json_spirit::read(weights_str, json_value)) {
9745 return -EINVAL;
9746 }
9747 if (json_value.type() != json_spirit::obj_type) {
9748 return -EINVAL;
9749 }
9750 const auto obj = json_value.get_obj();
9751 try {
9752 for (auto& osd_weight : obj) {
9753 auto osd_id = std::stoi(osd_weight.first);
9754 if (!osdmap.exists(osd_id)) {
9755 return -ENOENT;
9756 }
9757 if (osd_weight.second.type() != json_spirit::str_type) {
9758 return -EINVAL;
9759 }
9760 auto weight = std::stoul(osd_weight.second.get_str());
9761 weights->insert({osd_id, weight});
9762 }
9763 } catch (const std::logic_error& e) {
9764 return -EINVAL;
9765 }
9766 return 0;
9767 }
9768
/*
 * Mark osd `id` as destroyed: strip its auth entities and config-key data
 * and set CEPH_OSD_DESTROYED in the pending incremental.
 *
 * Caller must have plugged paxos and is responsible for proposing -- this
 * may run as part of `osd purge`, and each service may only propose once.
 *
 * @return 0 on success (including an already-destroyed osd), -ENOENT when
 *         the osd does not exist at all, or a negative auth error.
 */
int OSDMonitor::prepare_command_osd_destroy(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());

  // we check if the osd exists for the benefit of `osd purge`, which may
  // have previously removed the osd. If the osd does not exist, return
  // -ENOENT to convey this, and let the caller deal with it.
  //
  // we presume that all auth secrets and config keys were removed prior
  // to this command being called. if they exist by now, we also assume
  // they must have been created by some other command and do not pertain
  // to this non-existent osd.
  if (!osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist." << dendl;
    return -ENOENT;
  }

  uuid_d uuid = osdmap.get_uuid(id);
  dout(10) << __func__ << " destroying osd." << id
           << " uuid " << uuid << dendl;

  // if it has been destroyed, we assume our work here is done.
  if (osdmap.is_destroyed(id)) {
    ss << "destroyed osd." << id;
    return 0;
  }

  EntityName cephx_entity, lockbox_entity;
  // track which services have nothing left to remove, so we skip their
  // destructive updates below and stay idempotent
  bool idempotent_auth = false, idempotent_cks = false;

  int err = mon.authmon()->validate_osd_destroy(id, uuid,
                                                cephx_entity,
                                                lockbox_entity,
                                                ss);
  if (err < 0) {
    if (err == -ENOENT) {
      // auth entities already gone; nothing to do there
      idempotent_auth = true;
    } else {
      return err;
    }
  }

  auto svc = mon.kvmon();
  err = svc->validate_osd_destroy(id, uuid);
  if (err < 0) {
    // config-key entries already gone; nothing to do there
    ceph_assert(err == -ENOENT);
    err = 0;
    idempotent_cks = true;
  }

  if (!idempotent_auth) {
    err = mon.authmon()->do_osd_destroy(cephx_entity, lockbox_entity);
    ceph_assert(0 == err);
  }

  if (!idempotent_cks) {
    svc->do_osd_destroy(id, uuid);
  }

  pending_inc.new_state[id] = CEPH_OSD_DESTROYED;
  pending_inc.new_uuid[id] = uuid_d();

  // we can only propose_pending() once per service, otherwise we'll be
  // defying PaxosService and all laws of nature. Therefore, as we may
  // be used during 'osd purge', let's keep the caller responsible for
  // proposing.
  ceph_assert(err == 0);
  return 0;
}
9840
// Purge osd.<id>: remove it from crush, mark it destroyed, and remove
// it from the osdmap. Returns -ENOENT when the osd is already fully
// gone (idempotent replay), 0 on success, or a negative error before
// any change has been staged. The caller is responsible for proposing
// the pending changes.
int OSDMonitor::prepare_command_osd_purge(
    int32_t id,
    stringstream& ss)
{
  ceph_assert(paxos.is_plugged());
  dout(10) << __func__ << " purging osd." << id << dendl;

  // callers must have verified the osd is down before purging.
  ceph_assert(!osdmap.is_up(id));

  /*
   * This may look a bit weird, but this is what's going to happen:
   *
   * 1. we make sure that removing from crush works
   * 2. we call `prepare_command_osd_destroy()`. If it returns an
   * error, then we abort the whole operation, as no updates
   * have been made. However, this function will have
   * side-effects, thus we need to make sure that all operations
   * performed henceforth will *always* succeed.
   * 3. we call `prepare_command_osd_remove()`. Although this
   * function can return an error, it currently only checks if the
   * osd is up - and we have made sure that it is not so, so there
   * is no conflict, and it is effectively an update.
   * 4. finally, we call `do_osd_crush_remove()`, which will perform
   * the crush update we delayed from before.
   */

  CrushWrapper newcrush = _get_pending_crush();

  bool may_be_idempotent = false;

  // step 1: validate the crush removal, but do not commit it yet.
  int err = _prepare_command_osd_crush_remove(newcrush, id, 0, false, false);
  if (err == -ENOENT) {
    // not in crush at all -- possibly a replay of an earlier purge.
    err = 0;
    may_be_idempotent = true;
  } else if (err < 0) {
    ss << "error removing osd." << id << " from crush";
    return err;
  }

  // step 2: no point destroying the osd again if it has already been
  // marked destroyed.
  if (!osdmap.is_destroyed(id)) {
    err = prepare_command_osd_destroy(id, ss);
    if (err < 0) {
      if (err == -ENOENT) {
        // destroy saw no such osd; still potentially idempotent.
        err = 0;
      } else {
        return err;
      }
    } else {
      // destroy actually did work, so this is not a pure replay.
      may_be_idempotent = false;
    }
  }
  ceph_assert(0 == err);

  // if nothing above changed anything and the osd is gone from the map,
  // the whole purge already happened -- report that to the caller.
  if (may_be_idempotent && !osdmap.exists(id)) {
    dout(10) << __func__ << " osd." << id << " does not exist and "
             << "we are idempotent." << dendl;
    return -ENOENT;
  }

  // step 3: remove the osd from the osdmap.
  err = prepare_command_osd_remove(id);
  // we should not be busy, as we should have made sure this id is not up.
  ceph_assert(0 == err);

  // step 4: apply the crush change validated in step 1.
  do_osd_crush_remove(newcrush);
  return 0;
}
9908
9909 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op,
9910 const cmdmap_t& cmdmap)
9911 {
9912 op->mark_osdmon_event(__func__);
9913 auto m = op->get_req<MMonCommand>();
9914 bool ret = false;
9915 stringstream ss;
9916 string rs;
9917 bufferlist rdata;
9918 int err = 0;
9919
9920 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
9921 boost::scoped_ptr<Formatter> f(Formatter::create(format));
9922
9923 string prefix;
9924 cmd_getval(cmdmap, "prefix", prefix);
9925
9926 int64_t osdid;
9927 string osd_name;
9928 bool osdid_present = false;
9929 if (prefix != "osd pg-temp" &&
9930 prefix != "osd pg-upmap" &&
9931 prefix != "osd pg-upmap-items") { // avoid commands with non-int id arg
9932 osdid_present = cmd_getval(cmdmap, "id", osdid);
9933 }
9934 if (osdid_present) {
9935 ostringstream oss;
9936 oss << "osd." << osdid;
9937 osd_name = oss.str();
9938 }
9939
9940 // Even if there's a pending state with changes that could affect
9941 // a command, considering that said state isn't yet committed, we
9942 // just don't care about those changes if the command currently being
9943 // handled acts as a no-op against the current committed state.
9944 // In a nutshell, we assume this command happens *before*.
9945 //
9946 // Let me make this clearer:
9947 //
9948 // - If we have only one client, and that client issues some
9949 // operation that would conflict with this operation but is
9950 // still on the pending state, then we would be sure that said
9951 // operation wouldn't have returned yet, so the client wouldn't
9952 // issue this operation (unless the client didn't wait for the
9953 // operation to finish, and that would be the client's own fault).
9954 //
9955 // - If we have more than one client, each client will observe
9956 // whatever is the state at the moment of the commit. So, if we
9957 // have two clients, one issuing an unlink and another issuing a
9958 // link, and if the link happens while the unlink is still on the
9959 // pending state, from the link's point-of-view this is a no-op.
9960 // If different clients are issuing conflicting operations and
9961 // they care about that, then the clients should make sure they
9962 // enforce some kind of concurrency mechanism -- from our
9963 // perspective that's what Douglas Adams would call an SEP.
9964 //
9965 // This should be used as a general guideline for most commands handled
9966 // in this function. Adapt as you see fit, but please bear in mind that
9967 // this is the expected behavior.
9968
9969
9970 if (prefix == "osd setcrushmap" ||
9971 (prefix == "osd crush set" && !osdid_present)) {
9972 if (pending_inc.crush.length()) {
9973 dout(10) << __func__ << " waiting for pending crush update " << dendl;
9974 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
9975 return true;
9976 }
9977 dout(10) << "prepare_command setting new crush map" << dendl;
9978 bufferlist data(m->get_data());
9979 CrushWrapper crush;
9980 try {
9981 auto bl = data.cbegin();
9982 crush.decode(bl);
9983 }
9984 catch (const std::exception &e) {
9985 err = -EINVAL;
9986 ss << "Failed to parse crushmap: " << e.what();
9987 goto reply;
9988 }
9989
9990 int64_t prior_version = 0;
9991 if (cmd_getval(cmdmap, "prior_version", prior_version)) {
9992 if (prior_version == osdmap.get_crush_version() - 1) {
9993 // see if we are a resend of the last update. this is imperfect
9994 // (multiple racing updaters may not both get reliable success)
9995 // but we expect crush updaters (via this interface) to be rare-ish.
9996 bufferlist current, proposed;
9997 osdmap.crush->encode(current, mon.get_quorum_con_features());
9998 crush.encode(proposed, mon.get_quorum_con_features());
9999 if (current.contents_equal(proposed)) {
10000 dout(10) << __func__
10001 << " proposed matches current and version equals previous"
10002 << dendl;
10003 err = 0;
10004 ss << osdmap.get_crush_version();
10005 goto reply;
10006 }
10007 }
10008 if (prior_version != osdmap.get_crush_version()) {
10009 err = -EPERM;
10010 ss << "prior_version " << prior_version << " != crush version "
10011 << osdmap.get_crush_version();
10012 goto reply;
10013 }
10014 }
10015
10016 if (!validate_crush_against_features(&crush, ss)) {
10017 err = -EINVAL;
10018 goto reply;
10019 }
10020
10021 err = osdmap.validate_crush_rules(&crush, &ss);
10022 if (err < 0) {
10023 goto reply;
10024 }
10025
10026 if (g_conf()->mon_osd_crush_smoke_test) {
10027 // sanity check: test some inputs to make sure this map isn't
10028 // totally broken
10029 dout(10) << " testing map" << dendl;
10030 stringstream ess;
10031 CrushTester tester(crush, ess);
10032 tester.set_min_x(0);
10033 tester.set_max_x(50);
10034 tester.set_num_rep(3); // arbitrary
10035 auto start = ceph::coarse_mono_clock::now();
10036 int r = tester.test_with_fork(g_conf()->mon_lease);
10037 auto duration = ceph::coarse_mono_clock::now() - start;
10038 if (r < 0) {
10039 dout(10) << " tester.test_with_fork returns " << r
10040 << ": " << ess.str() << dendl;
10041 ss << "crush smoke test failed with " << r << ": " << ess.str();
10042 err = r;
10043 goto reply;
10044 }
10045 dout(10) << __func__ << " crush somke test duration: "
10046 << duration << ", result: " << ess.str() << dendl;
10047 }
10048
10049 pending_inc.crush = data;
10050 ss << osdmap.get_crush_version() + 1;
10051 goto update;
10052
10053 } else if (prefix == "osd crush set-all-straw-buckets-to-straw2") {
10054 CrushWrapper newcrush = _get_pending_crush();
10055 for (int b = 0; b < newcrush.get_max_buckets(); ++b) {
10056 int bid = -1 - b;
10057 if (newcrush.bucket_exists(bid) &&
10058 newcrush.get_bucket_alg(bid) == CRUSH_BUCKET_STRAW) {
10059 dout(20) << " bucket " << bid << " is straw, can convert" << dendl;
10060 newcrush.bucket_set_alg(bid, CRUSH_BUCKET_STRAW2);
10061 }
10062 }
10063 if (!validate_crush_against_features(&newcrush, ss)) {
10064 err = -EINVAL;
10065 goto reply;
10066 }
10067 pending_inc.crush.clear();
10068 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10069 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10070 get_last_committed() + 1));
10071 return true;
10072 } else if (prefix == "osd crush set-device-class") {
10073 string device_class;
10074 if (!cmd_getval(cmdmap, "class", device_class)) {
10075 err = -EINVAL; // no value!
10076 goto reply;
10077 }
10078
10079 bool stop = false;
10080 vector<string> idvec;
10081 cmd_getval(cmdmap, "ids", idvec);
10082 CrushWrapper newcrush = _get_pending_crush();
10083 set<int> updated;
10084 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10085 set<int> osds;
10086 // wildcard?
10087 if (j == 0 &&
10088 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10089 osdmap.get_all_osds(osds);
10090 stop = true;
10091 } else {
10092 // try traditional single osd way
10093 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10094 if (osd < 0) {
10095 // ss has reason for failure
10096 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10097 err = -EINVAL;
10098 continue;
10099 }
10100 osds.insert(osd);
10101 }
10102
10103 for (auto &osd : osds) {
10104 if (!osdmap.exists(osd)) {
10105 ss << "osd." << osd << " does not exist. ";
10106 continue;
10107 }
10108
10109 ostringstream oss;
10110 oss << "osd." << osd;
10111 string name = oss.str();
10112
10113 if (newcrush.get_max_devices() < osd + 1) {
10114 newcrush.set_max_devices(osd + 1);
10115 }
10116 string action;
10117 if (newcrush.item_exists(osd)) {
10118 action = "updating";
10119 } else {
10120 action = "creating";
10121 newcrush.set_item_name(osd, name);
10122 }
10123
10124 dout(5) << action << " crush item id " << osd << " name '" << name
10125 << "' device_class '" << device_class << "'"
10126 << dendl;
10127 err = newcrush.update_device_class(osd, device_class, name, &ss);
10128 if (err < 0) {
10129 goto reply;
10130 }
10131 if (err == 0 && !_have_pending_crush()) {
10132 if (!stop) {
10133 // for single osd only, wildcard makes too much noise
10134 ss << "set-device-class item id " << osd << " name '" << name
10135 << "' device_class '" << device_class << "': no change. ";
10136 }
10137 } else {
10138 updated.insert(osd);
10139 }
10140 }
10141 }
10142
10143 pending_inc.crush.clear();
10144 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10145 ss << "set osd(s) " << updated << " to class '" << device_class << "'";
10146 getline(ss, rs);
10147 wait_for_finished_proposal(
10148 op,
10149 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10150 return true;
10151 } else if (prefix == "osd crush rm-device-class") {
10152 bool stop = false;
10153 vector<string> idvec;
10154 cmd_getval(cmdmap, "ids", idvec);
10155 CrushWrapper newcrush = _get_pending_crush();
10156 set<int> updated;
10157
10158 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
10159 set<int> osds;
10160
10161 // wildcard?
10162 if (j == 0 &&
10163 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
10164 osdmap.get_all_osds(osds);
10165 stop = true;
10166 } else {
10167 // try traditional single osd way
10168 long osd = parse_osd_id(idvec[j].c_str(), &ss);
10169 if (osd < 0) {
10170 // ss has reason for failure
10171 ss << ", unable to parse osd id:\"" << idvec[j] << "\". ";
10172 err = -EINVAL;
10173 goto reply;
10174 }
10175 osds.insert(osd);
10176 }
10177
10178 for (auto &osd : osds) {
10179 if (!osdmap.exists(osd)) {
10180 ss << "osd." << osd << " does not exist. ";
10181 continue;
10182 }
10183
10184 auto class_name = newcrush.get_item_class(osd);
10185 if (!class_name) {
10186 ss << "osd." << osd << " belongs to no class, ";
10187 continue;
10188 }
10189 // note that we do not verify if class_is_in_use here
10190 // in case the device is misclassified and user wants
10191 // to overridely reset...
10192
10193 err = newcrush.remove_device_class(cct, osd, &ss);
10194 if (err < 0) {
10195 // ss has reason for failure
10196 goto reply;
10197 }
10198 updated.insert(osd);
10199 }
10200 }
10201
10202 pending_inc.crush.clear();
10203 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10204 ss << "done removing class of osd(s): " << updated;
10205 getline(ss, rs);
10206 wait_for_finished_proposal(
10207 op,
10208 new Monitor::C_Command(mon,op, 0, rs, get_last_committed() + 1));
10209 return true;
10210 } else if (prefix == "osd crush class create") {
10211 string device_class;
10212 if (!cmd_getval(cmdmap, "class", device_class)) {
10213 err = -EINVAL; // no value!
10214 goto reply;
10215 }
10216 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10217 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10218 << "luminous' before using crush device classes";
10219 err = -EPERM;
10220 goto reply;
10221 }
10222 if (!_have_pending_crush() &&
10223 _get_stable_crush().class_exists(device_class)) {
10224 ss << "class '" << device_class << "' already exists";
10225 goto reply;
10226 }
10227 CrushWrapper newcrush = _get_pending_crush();
10228 if (newcrush.class_exists(device_class)) {
10229 ss << "class '" << device_class << "' already exists";
10230 goto update;
10231 }
10232 int class_id = newcrush.get_or_create_class_id(device_class);
10233 pending_inc.crush.clear();
10234 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10235 ss << "created class " << device_class << " with id " << class_id
10236 << " to crush map";
10237 goto update;
10238 } else if (prefix == "osd crush class rm") {
10239 string device_class;
10240 if (!cmd_getval(cmdmap, "class", device_class)) {
10241 err = -EINVAL; // no value!
10242 goto reply;
10243 }
10244 if (osdmap.require_osd_release < ceph_release_t::luminous) {
10245 ss << "you must complete the upgrade and 'ceph osd require-osd-release "
10246 << "luminous' before using crush device classes";
10247 err = -EPERM;
10248 goto reply;
10249 }
10250
10251 if (!osdmap.crush->class_exists(device_class)) {
10252 err = 0;
10253 goto reply;
10254 }
10255
10256 CrushWrapper newcrush = _get_pending_crush();
10257 if (!newcrush.class_exists(device_class)) {
10258 err = 0; // make command idempotent
10259 goto wait;
10260 }
10261 int class_id = newcrush.get_class_id(device_class);
10262 stringstream ts;
10263 if (newcrush.class_is_in_use(class_id, &ts)) {
10264 err = -EBUSY;
10265 ss << "class '" << device_class << "' " << ts.str();
10266 goto reply;
10267 }
10268
10269 // check if class is used by any erasure-code-profiles
10270 mempool::osdmap::map<string,map<string,string>> old_ec_profiles =
10271 osdmap.get_erasure_code_profiles();
10272 auto ec_profiles = pending_inc.get_erasure_code_profiles();
10273 #ifdef HAVE_STDLIB_MAP_SPLICING
10274 ec_profiles.merge(old_ec_profiles);
10275 #else
10276 ec_profiles.insert(make_move_iterator(begin(old_ec_profiles)),
10277 make_move_iterator(end(old_ec_profiles)));
10278 #endif
10279 list<string> referenced_by;
10280 for (auto &i: ec_profiles) {
10281 for (auto &j: i.second) {
10282 if ("crush-device-class" == j.first && device_class == j.second) {
10283 referenced_by.push_back(i.first);
10284 }
10285 }
10286 }
10287 if (!referenced_by.empty()) {
10288 err = -EBUSY;
10289 ss << "class '" << device_class
10290 << "' is still referenced by erasure-code-profile(s): " << referenced_by;
10291 goto reply;
10292 }
10293
10294 set<int> osds;
10295 newcrush.get_devices_by_class(device_class, &osds);
10296 for (auto& p: osds) {
10297 err = newcrush.remove_device_class(g_ceph_context, p, &ss);
10298 if (err < 0) {
10299 // ss has reason for failure
10300 goto reply;
10301 }
10302 }
10303
10304 if (osds.empty()) {
10305 // empty class, remove directly
10306 err = newcrush.remove_class_name(device_class);
10307 if (err < 0) {
10308 ss << "class '" << device_class << "' cannot be removed '"
10309 << cpp_strerror(err) << "'";
10310 goto reply;
10311 }
10312 }
10313
10314 pending_inc.crush.clear();
10315 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10316 ss << "removed class " << device_class << " with id " << class_id
10317 << " from crush map";
10318 goto update;
10319 } else if (prefix == "osd crush class rename") {
10320 string srcname, dstname;
10321 if (!cmd_getval(cmdmap, "srcname", srcname)) {
10322 err = -EINVAL;
10323 goto reply;
10324 }
10325 if (!cmd_getval(cmdmap, "dstname", dstname)) {
10326 err = -EINVAL;
10327 goto reply;
10328 }
10329
10330 CrushWrapper newcrush = _get_pending_crush();
10331 if (!newcrush.class_exists(srcname) && newcrush.class_exists(dstname)) {
10332 // suppose this is a replay and return success
10333 // so command is idempotent
10334 ss << "already renamed to '" << dstname << "'";
10335 err = 0;
10336 goto reply;
10337 }
10338
10339 err = newcrush.rename_class(srcname, dstname);
10340 if (err < 0) {
10341 ss << "fail to rename '" << srcname << "' to '" << dstname << "' : "
10342 << cpp_strerror(err);
10343 goto reply;
10344 }
10345
10346 pending_inc.crush.clear();
10347 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10348 ss << "rename class '" << srcname << "' to '" << dstname << "'";
10349 goto update;
10350 } else if (prefix == "osd crush add-bucket") {
10351 // os crush add-bucket <name> <type>
10352 string name, typestr;
10353 vector<string> argvec;
10354 cmd_getval(cmdmap, "name", name);
10355 cmd_getval(cmdmap, "type", typestr);
10356 cmd_getval(cmdmap, "args", argvec);
10357 map<string,string> loc;
10358 if (!argvec.empty()) {
10359 CrushWrapper::parse_loc_map(argvec, &loc);
10360 dout(0) << "will create and move bucket '" << name
10361 << "' to location " << loc << dendl;
10362 }
10363
10364 if (!_have_pending_crush() &&
10365 _get_stable_crush().name_exists(name)) {
10366 ss << "bucket '" << name << "' already exists";
10367 goto reply;
10368 }
10369
10370 CrushWrapper newcrush = _get_pending_crush();
10371
10372 if (newcrush.name_exists(name)) {
10373 ss << "bucket '" << name << "' already exists";
10374 goto update;
10375 }
10376 int type = newcrush.get_type_id(typestr);
10377 if (type < 0) {
10378 ss << "type '" << typestr << "' does not exist";
10379 err = -EINVAL;
10380 goto reply;
10381 }
10382 if (type == 0) {
10383 ss << "type '" << typestr << "' is for devices, not buckets";
10384 err = -EINVAL;
10385 goto reply;
10386 }
10387 int bucketno;
10388 err = newcrush.add_bucket(0, 0,
10389 CRUSH_HASH_DEFAULT, type, 0, NULL,
10390 NULL, &bucketno);
10391 if (err < 0) {
10392 ss << "add_bucket error: '" << cpp_strerror(err) << "'";
10393 goto reply;
10394 }
10395 err = newcrush.set_item_name(bucketno, name);
10396 if (err < 0) {
10397 ss << "error setting bucket name to '" << name << "'";
10398 goto reply;
10399 }
10400
10401 if (!loc.empty()) {
10402 if (!newcrush.check_item_loc(cct, bucketno, loc,
10403 (int *)NULL)) {
10404 err = newcrush.move_bucket(cct, bucketno, loc);
10405 if (err < 0) {
10406 ss << "error moving bucket '" << name << "' to location " << loc;
10407 goto reply;
10408 }
10409 } else {
10410 ss << "no need to move item id " << bucketno << " name '" << name
10411 << "' to location " << loc << " in crush map";
10412 }
10413 }
10414
10415 pending_inc.crush.clear();
10416 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10417 if (loc.empty()) {
10418 ss << "added bucket " << name << " type " << typestr
10419 << " to crush map";
10420 } else {
10421 ss << "added bucket " << name << " type " << typestr
10422 << " to location " << loc;
10423 }
10424 goto update;
10425 } else if (prefix == "osd crush rename-bucket") {
10426 string srcname, dstname;
10427 cmd_getval(cmdmap, "srcname", srcname);
10428 cmd_getval(cmdmap, "dstname", dstname);
10429
10430 err = crush_rename_bucket(srcname, dstname, &ss);
10431 if (err == -EALREADY) // equivalent to success for idempotency
10432 err = 0;
10433 if (err)
10434 goto reply;
10435 else
10436 goto update;
10437 } else if (prefix == "osd crush weight-set create" ||
10438 prefix == "osd crush weight-set create-compat") {
10439 if (_have_pending_crush()) {
10440 dout(10) << " first waiting for pending crush changes to commit" << dendl;
10441 goto wait;
10442 }
10443 CrushWrapper newcrush = _get_pending_crush();
10444 int64_t pool;
10445 int positions;
10446 if (newcrush.has_non_straw2_buckets()) {
10447 ss << "crush map contains one or more bucket(s) that are not straw2";
10448 err = -EPERM;
10449 goto reply;
10450 }
10451 if (prefix == "osd crush weight-set create") {
10452 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
10453 osdmap.require_min_compat_client < ceph_release_t::luminous) {
10454 ss << "require_min_compat_client "
10455 << osdmap.require_min_compat_client
10456 << " < luminous, which is required for per-pool weight-sets. "
10457 << "Try 'ceph osd set-require-min-compat-client luminous' "
10458 << "before using the new interface";
10459 err = -EPERM;
10460 goto reply;
10461 }
10462 string poolname, mode;
10463 cmd_getval(cmdmap, "pool", poolname);
10464 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10465 if (pool < 0) {
10466 ss << "pool '" << poolname << "' not found";
10467 err = -ENOENT;
10468 goto reply;
10469 }
10470 cmd_getval(cmdmap, "mode", mode);
10471 if (mode != "flat" && mode != "positional") {
10472 ss << "unrecognized weight-set mode '" << mode << "'";
10473 err = -EINVAL;
10474 goto reply;
10475 }
10476 positions = mode == "flat" ? 1 : osdmap.get_pg_pool(pool)->get_size();
10477 } else {
10478 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10479 positions = 1;
10480 }
10481 if (!newcrush.create_choose_args(pool, positions)) {
10482 if (pool == CrushWrapper::DEFAULT_CHOOSE_ARGS) {
10483 ss << "compat weight-set already created";
10484 } else {
10485 ss << "weight-set for pool '" << osdmap.get_pool_name(pool)
10486 << "' already created";
10487 }
10488 goto reply;
10489 }
10490 pending_inc.crush.clear();
10491 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10492 goto update;
10493
10494 } else if (prefix == "osd crush weight-set rm" ||
10495 prefix == "osd crush weight-set rm-compat") {
10496 CrushWrapper newcrush = _get_pending_crush();
10497 int64_t pool;
10498 if (prefix == "osd crush weight-set rm") {
10499 string poolname;
10500 cmd_getval(cmdmap, "pool", poolname);
10501 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10502 if (pool < 0) {
10503 ss << "pool '" << poolname << "' not found";
10504 err = -ENOENT;
10505 goto reply;
10506 }
10507 } else {
10508 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10509 }
10510 newcrush.rm_choose_args(pool);
10511 pending_inc.crush.clear();
10512 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10513 goto update;
10514
10515 } else if (prefix == "osd crush weight-set reweight" ||
10516 prefix == "osd crush weight-set reweight-compat") {
10517 string poolname, item;
10518 vector<double> weight;
10519 cmd_getval(cmdmap, "pool", poolname);
10520 cmd_getval(cmdmap, "item", item);
10521 cmd_getval(cmdmap, "weight", weight);
10522 CrushWrapper newcrush = _get_pending_crush();
10523 int64_t pool;
10524 if (prefix == "osd crush weight-set reweight") {
10525 pool = osdmap.lookup_pg_pool_name(poolname.c_str());
10526 if (pool < 0) {
10527 ss << "pool '" << poolname << "' not found";
10528 err = -ENOENT;
10529 goto reply;
10530 }
10531 if (!newcrush.have_choose_args(pool)) {
10532 ss << "no weight-set for pool '" << poolname << "'";
10533 err = -ENOENT;
10534 goto reply;
10535 }
10536 auto arg_map = newcrush.choose_args_get(pool);
10537 int positions = newcrush.get_choose_args_positions(arg_map);
10538 if (weight.size() != (size_t)positions) {
10539 ss << "must specify exact " << positions << " weight values";
10540 err = -EINVAL;
10541 goto reply;
10542 }
10543 } else {
10544 pool = CrushWrapper::DEFAULT_CHOOSE_ARGS;
10545 if (!newcrush.have_choose_args(pool)) {
10546 ss << "no backward-compatible weight-set";
10547 err = -ENOENT;
10548 goto reply;
10549 }
10550 }
10551 if (!newcrush.name_exists(item)) {
10552 ss << "item '" << item << "' does not exist";
10553 err = -ENOENT;
10554 goto reply;
10555 }
10556 err = newcrush.choose_args_adjust_item_weightf(
10557 cct,
10558 newcrush.choose_args_get(pool),
10559 newcrush.get_item_id(item),
10560 weight,
10561 &ss);
10562 if (err < 0) {
10563 goto reply;
10564 }
10565 err = 0;
10566 pending_inc.crush.clear();
10567 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10568 goto update;
10569 } else if (osdid_present &&
10570 (prefix == "osd crush set" || prefix == "osd crush add")) {
10571 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10572 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10573 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10574
10575 if (!osdmap.exists(osdid)) {
10576 err = -ENOENT;
10577 ss << osd_name
10578 << " does not exist. Create it before updating the crush map";
10579 goto reply;
10580 }
10581
10582 double weight;
10583 if (!cmd_getval(cmdmap, "weight", weight)) {
10584 ss << "unable to parse weight value '"
10585 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10586 err = -EINVAL;
10587 goto reply;
10588 }
10589
10590 string args;
10591 vector<string> argvec;
10592 cmd_getval(cmdmap, "args", argvec);
10593 map<string,string> loc;
10594 CrushWrapper::parse_loc_map(argvec, &loc);
10595
10596 if (prefix == "osd crush set"
10597 && !_get_stable_crush().item_exists(osdid)) {
10598 err = -ENOENT;
10599 ss << "unable to set item id " << osdid << " name '" << osd_name
10600 << "' weight " << weight << " at location " << loc
10601 << ": does not exist";
10602 goto reply;
10603 }
10604
10605 dout(5) << "adding/updating crush item id " << osdid << " name '"
10606 << osd_name << "' weight " << weight << " at location "
10607 << loc << dendl;
10608 CrushWrapper newcrush = _get_pending_crush();
10609
10610 string action;
10611 if (prefix == "osd crush set" ||
10612 newcrush.check_item_loc(cct, osdid, loc, (int *)NULL)) {
10613 action = "set";
10614 err = newcrush.update_item(cct, osdid, weight, osd_name, loc);
10615 } else {
10616 action = "add";
10617 err = newcrush.insert_item(cct, osdid, weight, osd_name, loc);
10618 if (err == 0)
10619 err = 1;
10620 }
10621
10622 if (err < 0)
10623 goto reply;
10624
10625 if (err == 0 && !_have_pending_crush()) {
10626 ss << action << " item id " << osdid << " name '" << osd_name
10627 << "' weight " << weight << " at location " << loc << ": no change";
10628 goto reply;
10629 }
10630
10631 pending_inc.crush.clear();
10632 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10633 ss << action << " item id " << osdid << " name '" << osd_name << "' weight "
10634 << weight << " at location " << loc << " to crush map";
10635 getline(ss, rs);
10636 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10637 get_last_committed() + 1));
10638 return true;
10639
10640 } else if (prefix == "osd crush create-or-move") {
10641 do {
10642 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10643 if (!osdmap.exists(osdid)) {
10644 err = -ENOENT;
10645 ss << osd_name
10646 << " does not exist. create it before updating the crush map";
10647 goto reply;
10648 }
10649
10650 double weight;
10651 if (!cmd_getval(cmdmap, "weight", weight)) {
10652 ss << "unable to parse weight value '"
10653 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10654 err = -EINVAL;
10655 goto reply;
10656 }
10657
10658 string args;
10659 vector<string> argvec;
10660 cmd_getval(cmdmap, "args", argvec);
10661 map<string,string> loc;
10662 CrushWrapper::parse_loc_map(argvec, &loc);
10663
10664 dout(0) << "create-or-move crush item name '" << osd_name
10665 << "' initial_weight " << weight << " at location " << loc
10666 << dendl;
10667
10668 CrushWrapper newcrush = _get_pending_crush();
10669
10670 err = newcrush.create_or_move_item(cct, osdid, weight, osd_name, loc,
10671 g_conf()->osd_crush_update_weight_set);
10672 if (err == 0) {
10673 ss << "create-or-move updated item name '" << osd_name
10674 << "' weight " << weight
10675 << " at location " << loc << " to crush map";
10676 break;
10677 }
10678 if (err > 0) {
10679 pending_inc.crush.clear();
10680 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10681 ss << "create-or-move updating item name '" << osd_name
10682 << "' weight " << weight
10683 << " at location " << loc << " to crush map";
10684 getline(ss, rs);
10685 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10686 get_last_committed() + 1));
10687 return true;
10688 }
10689 } while (false);
10690
10691 } else if (prefix == "osd crush move") {
10692 do {
10693 // osd crush move <name> <loc1> [<loc2> ...]
10694 string name;
10695 vector<string> argvec;
10696 cmd_getval(cmdmap, "name", name);
10697 cmd_getval(cmdmap, "args", argvec);
10698 map<string,string> loc;
10699 CrushWrapper::parse_loc_map(argvec, &loc);
10700
10701 dout(0) << "moving crush item name '" << name << "' to location " << loc << dendl;
10702 CrushWrapper newcrush = _get_pending_crush();
10703
10704 if (!newcrush.name_exists(name)) {
10705 err = -ENOENT;
10706 ss << "item " << name << " does not exist";
10707 break;
10708 }
10709 int id = newcrush.get_item_id(name);
10710
10711 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10712 if (id >= 0) {
10713 err = newcrush.create_or_move_item(
10714 cct, id, 0, name, loc,
10715 g_conf()->osd_crush_update_weight_set);
10716 } else {
10717 err = newcrush.move_bucket(cct, id, loc);
10718 }
10719 if (err >= 0) {
10720 ss << "moved item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10721 pending_inc.crush.clear();
10722 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10723 getline(ss, rs);
10724 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10725 get_last_committed() + 1));
10726 return true;
10727 }
10728 } else {
10729 ss << "no need to move item id " << id << " name '" << name << "' to location " << loc << " in crush map";
10730 err = 0;
10731 }
10732 } while (false);
10733 } else if (prefix == "osd crush swap-bucket") {
10734 string source, dest;
10735 cmd_getval(cmdmap, "source", source);
10736 cmd_getval(cmdmap, "dest", dest);
10737
10738 bool force = false;
10739 cmd_getval(cmdmap, "yes_i_really_mean_it", force);
10740
10741 CrushWrapper newcrush = _get_pending_crush();
10742 if (!newcrush.name_exists(source)) {
10743 ss << "source item " << source << " does not exist";
10744 err = -ENOENT;
10745 goto reply;
10746 }
10747 if (!newcrush.name_exists(dest)) {
10748 ss << "dest item " << dest << " does not exist";
10749 err = -ENOENT;
10750 goto reply;
10751 }
10752 int sid = newcrush.get_item_id(source);
10753 int did = newcrush.get_item_id(dest);
10754 int sparent;
10755 if (newcrush.get_immediate_parent_id(sid, &sparent) == 0 && !force) {
10756 ss << "source item " << source << " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10757 err = -EPERM;
10758 goto reply;
10759 }
10760 if (newcrush.get_bucket_alg(sid) != newcrush.get_bucket_alg(did) &&
10761 !force) {
10762 ss << "source bucket alg " << crush_alg_name(newcrush.get_bucket_alg(sid)) << " != "
10763 << "dest bucket alg " << crush_alg_name(newcrush.get_bucket_alg(did))
10764 << "; pass --yes-i-really-mean-it to proceed anyway";
10765 err = -EPERM;
10766 goto reply;
10767 }
10768 int r = newcrush.swap_bucket(cct, sid, did);
10769 if (r < 0) {
10770 ss << "failed to swap bucket contents: " << cpp_strerror(r);
10771 err = r;
10772 goto reply;
10773 }
10774 ss << "swapped bucket of " << source << " to " << dest;
10775 pending_inc.crush.clear();
10776 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10777 wait_for_finished_proposal(op,
10778 new Monitor::C_Command(mon, op, err, ss.str(),
10779 get_last_committed() + 1));
10780 return true;
10781 } else if (prefix == "osd crush link") {
10782 // osd crush link <name> <loc1> [<loc2> ...]
10783 string name;
10784 cmd_getval(cmdmap, "name", name);
10785 vector<string> argvec;
10786 cmd_getval(cmdmap, "args", argvec);
10787 map<string,string> loc;
10788 CrushWrapper::parse_loc_map(argvec, &loc);
10789
10790 // Need an explicit check for name_exists because get_item_id returns
10791 // 0 on unfound.
10792 int id = osdmap.crush->get_item_id(name);
10793 if (!osdmap.crush->name_exists(name)) {
10794 err = -ENOENT;
10795 ss << "item " << name << " does not exist";
10796 goto reply;
10797 } else {
10798 dout(5) << "resolved crush name '" << name << "' to id " << id << dendl;
10799 }
10800 if (osdmap.crush->check_item_loc(cct, id, loc, (int*) NULL)) {
10801 ss << "no need to move item id " << id << " name '" << name
10802 << "' to location " << loc << " in crush map";
10803 err = 0;
10804 goto reply;
10805 }
10806
10807 dout(5) << "linking crush item name '" << name << "' at location " << loc << dendl;
10808 CrushWrapper newcrush = _get_pending_crush();
10809
10810 if (!newcrush.name_exists(name)) {
10811 err = -ENOENT;
10812 ss << "item " << name << " does not exist";
10813 goto reply;
10814 } else {
10815 int id = newcrush.get_item_id(name);
10816 if (!newcrush.check_item_loc(cct, id, loc, (int *)NULL)) {
10817 err = newcrush.link_bucket(cct, id, loc);
10818 if (err >= 0) {
10819 ss << "linked item id " << id << " name '" << name
10820 << "' to location " << loc << " in crush map";
10821 pending_inc.crush.clear();
10822 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10823 } else {
10824 ss << "cannot link item id " << id << " name '" << name
10825 << "' to location " << loc;
10826 goto reply;
10827 }
10828 } else {
10829 ss << "no need to move item id " << id << " name '" << name
10830 << "' to location " << loc << " in crush map";
10831 err = 0;
10832 }
10833 }
10834 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, ss.str(),
10835 get_last_committed() + 1));
10836 return true;
10837 } else if (prefix == "osd crush rm" ||
10838 prefix == "osd crush remove" ||
10839 prefix == "osd crush unlink") {
10840 do {
10841 // osd crush rm <id> [ancestor]
10842 CrushWrapper newcrush = _get_pending_crush();
10843
10844 string name;
10845 cmd_getval(cmdmap, "name", name);
10846
10847 if (!osdmap.crush->name_exists(name)) {
10848 err = 0;
10849 ss << "device '" << name << "' does not appear in the crush map";
10850 break;
10851 }
10852 if (!newcrush.name_exists(name)) {
10853 err = 0;
10854 ss << "device '" << name << "' does not appear in the crush map";
10855 getline(ss, rs);
10856 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10857 get_last_committed() + 1));
10858 return true;
10859 }
10860 int id = newcrush.get_item_id(name);
10861 int ancestor = 0;
10862
10863 bool unlink_only = prefix == "osd crush unlink";
10864 string ancestor_str;
10865 if (cmd_getval(cmdmap, "ancestor", ancestor_str)) {
10866 if (!newcrush.name_exists(ancestor_str)) {
10867 err = -ENOENT;
10868 ss << "ancestor item '" << ancestor_str
10869 << "' does not appear in the crush map";
10870 break;
10871 }
10872 ancestor = newcrush.get_item_id(ancestor_str);
10873 }
10874
10875 err = prepare_command_osd_crush_remove(
10876 newcrush,
10877 id, ancestor,
10878 (ancestor < 0), unlink_only);
10879
10880 if (err == -ENOENT) {
10881 ss << "item " << id << " does not appear in that position";
10882 err = 0;
10883 break;
10884 }
10885 if (err == 0) {
10886 if (!unlink_only)
10887 pending_inc.new_crush_node_flags[id] = 0;
10888 ss << "removed item id " << id << " name '" << name << "' from crush map";
10889 getline(ss, rs);
10890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10891 get_last_committed() + 1));
10892 return true;
10893 }
10894 } while (false);
10895
10896 } else if (prefix == "osd crush reweight-all") {
10897 CrushWrapper newcrush = _get_pending_crush();
10898
10899 newcrush.reweight(cct);
10900 pending_inc.crush.clear();
10901 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10902 ss << "reweighted crush hierarchy";
10903 getline(ss, rs);
10904 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10905 get_last_committed() + 1));
10906 return true;
10907 } else if (prefix == "osd crush reweight") {
10908 // osd crush reweight <name> <weight>
10909 CrushWrapper newcrush = _get_pending_crush();
10910
10911 string name;
10912 cmd_getval(cmdmap, "name", name);
10913 if (!newcrush.name_exists(name)) {
10914 err = -ENOENT;
10915 ss << "device '" << name << "' does not appear in the crush map";
10916 goto reply;
10917 }
10918
10919 int id = newcrush.get_item_id(name);
10920 if (id < 0) {
10921 ss << "device '" << name << "' is not a leaf in the crush map";
10922 err = -EINVAL;
10923 goto reply;
10924 }
10925 double w;
10926 if (!cmd_getval(cmdmap, "weight", w)) {
10927 ss << "unable to parse weight value '"
10928 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10929 err = -EINVAL;
10930 goto reply;
10931 }
10932
10933 err = newcrush.adjust_item_weightf(cct, id, w,
10934 g_conf()->osd_crush_update_weight_set);
10935 if (err < 0)
10936 goto reply;
10937 pending_inc.crush.clear();
10938 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10939 ss << "reweighted item id " << id << " name '" << name << "' to " << w
10940 << " in crush map";
10941 getline(ss, rs);
10942 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10943 get_last_committed() + 1));
10944 return true;
10945 } else if (prefix == "osd crush reweight-subtree") {
10946 // osd crush reweight <name> <weight>
10947 CrushWrapper newcrush = _get_pending_crush();
10948
10949 string name;
10950 cmd_getval(cmdmap, "name", name);
10951 if (!newcrush.name_exists(name)) {
10952 err = -ENOENT;
10953 ss << "device '" << name << "' does not appear in the crush map";
10954 goto reply;
10955 }
10956
10957 int id = newcrush.get_item_id(name);
10958 if (id >= 0) {
10959 ss << "device '" << name << "' is not a subtree in the crush map";
10960 err = -EINVAL;
10961 goto reply;
10962 }
10963 double w;
10964 if (!cmd_getval(cmdmap, "weight", w)) {
10965 ss << "unable to parse weight value '"
10966 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
10967 err = -EINVAL;
10968 goto reply;
10969 }
10970
10971 err = newcrush.adjust_subtree_weightf(cct, id, w,
10972 g_conf()->osd_crush_update_weight_set);
10973 if (err < 0)
10974 goto reply;
10975 pending_inc.crush.clear();
10976 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
10977 ss << "reweighted subtree id " << id << " name '" << name << "' to " << w
10978 << " in crush map";
10979 getline(ss, rs);
10980 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
10981 get_last_committed() + 1));
10982 return true;
10983 } else if (prefix == "osd crush tunables") {
10984 CrushWrapper newcrush = _get_pending_crush();
10985
10986 err = 0;
10987 string profile;
10988 cmd_getval(cmdmap, "profile", profile);
10989 if (profile == "legacy" || profile == "argonaut") {
10990 newcrush.set_tunables_legacy();
10991 } else if (profile == "bobtail") {
10992 newcrush.set_tunables_bobtail();
10993 } else if (profile == "firefly") {
10994 newcrush.set_tunables_firefly();
10995 } else if (profile == "hammer") {
10996 newcrush.set_tunables_hammer();
10997 } else if (profile == "jewel") {
10998 newcrush.set_tunables_jewel();
10999 } else if (profile == "optimal") {
11000 newcrush.set_tunables_optimal();
11001 } else if (profile == "default") {
11002 newcrush.set_tunables_default();
11003 } else {
11004 ss << "unrecognized profile '" << profile << "'";
11005 err = -EINVAL;
11006 goto reply;
11007 }
11008
11009 if (!validate_crush_against_features(&newcrush, ss)) {
11010 err = -EINVAL;
11011 goto reply;
11012 }
11013
11014 pending_inc.crush.clear();
11015 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11016 ss << "adjusted tunables profile to " << profile;
11017 getline(ss, rs);
11018 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11019 get_last_committed() + 1));
11020 return true;
11021 } else if (prefix == "osd crush set-tunable") {
11022 CrushWrapper newcrush = _get_pending_crush();
11023
11024 err = 0;
11025 string tunable;
11026 cmd_getval(cmdmap, "tunable", tunable);
11027
11028 int64_t value = -1;
11029 if (!cmd_getval(cmdmap, "value", value)) {
11030 err = -EINVAL;
11031 ss << "failed to parse integer value "
11032 << cmd_vartype_stringify(cmdmap.at("value"));
11033 goto reply;
11034 }
11035
11036 if (tunable == "straw_calc_version") {
11037 if (value != 0 && value != 1) {
11038 ss << "value must be 0 or 1; got " << value;
11039 err = -EINVAL;
11040 goto reply;
11041 }
11042 newcrush.set_straw_calc_version(value);
11043 } else {
11044 ss << "unrecognized tunable '" << tunable << "'";
11045 err = -EINVAL;
11046 goto reply;
11047 }
11048
11049 if (!validate_crush_against_features(&newcrush, ss)) {
11050 err = -EINVAL;
11051 goto reply;
11052 }
11053
11054 pending_inc.crush.clear();
11055 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11056 ss << "adjusted tunable " << tunable << " to " << value;
11057 getline(ss, rs);
11058 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11059 get_last_committed() + 1));
11060 return true;
11061
11062 } else if (prefix == "osd crush rule create-simple") {
11063 string name, root, type, mode;
11064 cmd_getval(cmdmap, "name", name);
11065 cmd_getval(cmdmap, "root", root);
11066 cmd_getval(cmdmap, "type", type);
11067 cmd_getval(cmdmap, "mode", mode);
11068 if (mode == "")
11069 mode = "firstn";
11070
11071 if (osdmap.crush->rule_exists(name)) {
11072 // The name is uniquely associated to a ruleid and the rule it contains
11073 // From the user point of view, the rule is more meaningfull.
11074 ss << "rule " << name << " already exists";
11075 err = 0;
11076 goto reply;
11077 }
11078
11079 CrushWrapper newcrush = _get_pending_crush();
11080
11081 if (newcrush.rule_exists(name)) {
11082 // The name is uniquely associated to a ruleid and the rule it contains
11083 // From the user point of view, the rule is more meaningfull.
11084 ss << "rule " << name << " already exists";
11085 err = 0;
11086 } else {
11087 int ruleno = newcrush.add_simple_rule(name, root, type, "", mode,
11088 pg_pool_t::TYPE_REPLICATED, &ss);
11089 if (ruleno < 0) {
11090 err = ruleno;
11091 goto reply;
11092 }
11093
11094 pending_inc.crush.clear();
11095 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11096 }
11097 getline(ss, rs);
11098 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11099 get_last_committed() + 1));
11100 return true;
11101
11102 } else if (prefix == "osd crush rule create-replicated") {
11103 string name, root, type, device_class;
11104 cmd_getval(cmdmap, "name", name);
11105 cmd_getval(cmdmap, "root", root);
11106 cmd_getval(cmdmap, "type", type);
11107 cmd_getval(cmdmap, "class", device_class);
11108
11109 if (osdmap.crush->rule_exists(name)) {
11110 // The name is uniquely associated to a ruleid and the rule it contains
11111 // From the user point of view, the rule is more meaningfull.
11112 ss << "rule " << name << " already exists";
11113 err = 0;
11114 goto reply;
11115 }
11116
11117 CrushWrapper newcrush = _get_pending_crush();
11118
11119 if (newcrush.rule_exists(name)) {
11120 // The name is uniquely associated to a ruleid and the rule it contains
11121 // From the user point of view, the rule is more meaningfull.
11122 ss << "rule " << name << " already exists";
11123 err = 0;
11124 } else {
11125 int ruleno = newcrush.add_simple_rule(
11126 name, root, type, device_class,
11127 "firstn", pg_pool_t::TYPE_REPLICATED, &ss);
11128 if (ruleno < 0) {
11129 err = ruleno;
11130 goto reply;
11131 }
11132
11133 pending_inc.crush.clear();
11134 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11135 }
11136 getline(ss, rs);
11137 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11138 get_last_committed() + 1));
11139 return true;
11140
11141 } else if (prefix == "osd erasure-code-profile rm") {
11142 string name;
11143 cmd_getval(cmdmap, "name", name);
11144
11145 if (erasure_code_profile_in_use(pending_inc.new_pools, name, &ss))
11146 goto wait;
11147
11148 if (erasure_code_profile_in_use(osdmap.pools, name, &ss)) {
11149 err = -EBUSY;
11150 goto reply;
11151 }
11152
11153 if (osdmap.has_erasure_code_profile(name) ||
11154 pending_inc.new_erasure_code_profiles.count(name)) {
11155 if (osdmap.has_erasure_code_profile(name)) {
11156 pending_inc.old_erasure_code_profiles.push_back(name);
11157 } else {
11158 dout(20) << "erasure code profile rm " << name << ": creation canceled" << dendl;
11159 pending_inc.new_erasure_code_profiles.erase(name);
11160 }
11161
11162 getline(ss, rs);
11163 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11164 get_last_committed() + 1));
11165 return true;
11166 } else {
11167 ss << "erasure-code-profile " << name << " does not exist";
11168 err = 0;
11169 goto reply;
11170 }
11171
11172 } else if (prefix == "osd erasure-code-profile set") {
11173 string name;
11174 cmd_getval(cmdmap, "name", name);
11175 vector<string> profile;
11176 cmd_getval(cmdmap, "profile", profile);
11177
11178 bool force = false;
11179 cmd_getval(cmdmap, "force", force);
11180
11181 map<string,string> profile_map;
11182 err = parse_erasure_code_profile(profile, &profile_map, &ss);
11183 if (err)
11184 goto reply;
11185 if (auto found = profile_map.find("crush-failure-domain");
11186 found != profile_map.end()) {
11187 const auto& failure_domain = found->second;
11188 int failure_domain_type = osdmap.crush->get_type_id(failure_domain);
11189 if (failure_domain_type < 0) {
11190 ss << "erasure-code-profile " << profile_map
11191 << " contains an invalid failure-domain " << std::quoted(failure_domain);
11192 err = -EINVAL;
11193 goto reply;
11194 }
11195 }
11196
11197 if (profile_map.find("plugin") == profile_map.end()) {
11198 ss << "erasure-code-profile " << profile_map
11199 << " must contain a plugin entry" << std::endl;
11200 err = -EINVAL;
11201 goto reply;
11202 }
11203 string plugin = profile_map["plugin"];
11204
11205 if (pending_inc.has_erasure_code_profile(name)) {
11206 dout(20) << "erasure code profile " << name << " try again" << dendl;
11207 goto wait;
11208 } else {
11209 err = normalize_profile(name, profile_map, force, &ss);
11210 if (err)
11211 goto reply;
11212
11213 if (osdmap.has_erasure_code_profile(name)) {
11214 ErasureCodeProfile existing_profile_map =
11215 osdmap.get_erasure_code_profile(name);
11216 err = normalize_profile(name, existing_profile_map, force, &ss);
11217 if (err)
11218 goto reply;
11219
11220 if (existing_profile_map == profile_map) {
11221 err = 0;
11222 goto reply;
11223 }
11224 if (!force) {
11225 err = -EPERM;
11226 ss << "will not override erasure code profile " << name
11227 << " because the existing profile "
11228 << existing_profile_map
11229 << " is different from the proposed profile "
11230 << profile_map;
11231 goto reply;
11232 }
11233 }
11234
11235 dout(20) << "erasure code profile set " << name << "="
11236 << profile_map << dendl;
11237 pending_inc.set_erasure_code_profile(name, profile_map);
11238 }
11239
11240 getline(ss, rs);
11241 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11242 get_last_committed() + 1));
11243 return true;
11244
11245 } else if (prefix == "osd crush rule create-erasure") {
11246 err = check_cluster_features(CEPH_FEATURE_CRUSH_V2, ss);
11247 if (err == -EAGAIN)
11248 goto wait;
11249 if (err)
11250 goto reply;
11251 string name, poolstr;
11252 cmd_getval(cmdmap, "name", name);
11253 string profile;
11254 cmd_getval(cmdmap, "profile", profile);
11255 if (profile == "")
11256 profile = "default";
11257 if (profile == "default") {
11258 if (!osdmap.has_erasure_code_profile(profile)) {
11259 if (pending_inc.has_erasure_code_profile(profile)) {
11260 dout(20) << "erasure code profile " << profile << " already pending" << dendl;
11261 goto wait;
11262 }
11263
11264 map<string,string> profile_map;
11265 err = osdmap.get_erasure_code_profile_default(cct,
11266 profile_map,
11267 &ss);
11268 if (err)
11269 goto reply;
11270 err = normalize_profile(name, profile_map, true, &ss);
11271 if (err)
11272 goto reply;
11273 dout(20) << "erasure code profile set " << profile << "="
11274 << profile_map << dendl;
11275 pending_inc.set_erasure_code_profile(profile, profile_map);
11276 goto wait;
11277 }
11278 }
11279
11280 int rule;
11281 err = crush_rule_create_erasure(name, profile, &rule, &ss);
11282 if (err < 0) {
11283 switch(err) {
11284 case -EEXIST: // return immediately
11285 ss << "rule " << name << " already exists";
11286 err = 0;
11287 goto reply;
11288 break;
11289 case -EALREADY: // wait for pending to be proposed
11290 ss << "rule " << name << " already exists";
11291 err = 0;
11292 break;
11293 default: // non recoverable error
11294 goto reply;
11295 break;
11296 }
11297 } else {
11298 ss << "created rule " << name << " at " << rule;
11299 }
11300
11301 getline(ss, rs);
11302 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11303 get_last_committed() + 1));
11304 return true;
11305
11306 } else if (prefix == "osd crush rule rm") {
11307 string name;
11308 cmd_getval(cmdmap, "name", name);
11309
11310 if (!osdmap.crush->rule_exists(name)) {
11311 ss << "rule " << name << " does not exist";
11312 err = 0;
11313 goto reply;
11314 }
11315
11316 CrushWrapper newcrush = _get_pending_crush();
11317
11318 if (!newcrush.rule_exists(name)) {
11319 ss << "rule " << name << " does not exist";
11320 err = 0;
11321 } else {
11322 int ruleno = newcrush.get_rule_id(name);
11323 ceph_assert(ruleno >= 0);
11324
11325 // make sure it is not in use.
11326 // FIXME: this is ok in some situations, but let's not bother with that
11327 // complexity now.
11328 if (osdmap.crush_rule_in_use(ruleno)) {
11329 ss << "crush rule " << name << " (" << ruleno << ") is in use";
11330 err = -EBUSY;
11331 goto reply;
11332 }
11333
11334 err = newcrush.remove_rule(ruleno);
11335 if (err < 0) {
11336 goto reply;
11337 }
11338
11339 pending_inc.crush.clear();
11340 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11341 }
11342 getline(ss, rs);
11343 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11344 get_last_committed() + 1));
11345 return true;
11346
11347 } else if (prefix == "osd crush rule rename") {
11348 string srcname;
11349 string dstname;
11350 cmd_getval(cmdmap, "srcname", srcname);
11351 cmd_getval(cmdmap, "dstname", dstname);
11352 if (srcname.empty() || dstname.empty()) {
11353 ss << "must specify both source rule name and destination rule name";
11354 err = -EINVAL;
11355 goto reply;
11356 }
11357 if (srcname == dstname) {
11358 ss << "destination rule name is equal to source rule name";
11359 err = 0;
11360 goto reply;
11361 }
11362
11363 CrushWrapper newcrush = _get_pending_crush();
11364 if (!newcrush.rule_exists(srcname) && newcrush.rule_exists(dstname)) {
11365 // srcname does not exist and dstname already exists
11366 // suppose this is a replay and return success
11367 // (so this command is idempotent)
11368 ss << "already renamed to '" << dstname << "'";
11369 err = 0;
11370 goto reply;
11371 }
11372
11373 err = newcrush.rename_rule(srcname, dstname, &ss);
11374 if (err < 0) {
11375 // ss has reason for failure
11376 goto reply;
11377 }
11378 pending_inc.crush.clear();
11379 newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
11380 getline(ss, rs);
11381 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11382 get_last_committed() + 1));
11383 return true;
11384
11385 } else if (prefix == "osd setmaxosd") {
11386 int64_t newmax;
11387 if (!cmd_getval(cmdmap, "newmax", newmax)) {
11388 ss << "unable to parse 'newmax' value '"
11389 << cmd_vartype_stringify(cmdmap.at("newmax")) << "'";
11390 err = -EINVAL;
11391 goto reply;
11392 }
11393
11394 if (newmax > g_conf()->mon_max_osd) {
11395 err = -ERANGE;
11396 ss << "cannot set max_osd to " << newmax << " which is > conf.mon_max_osd ("
11397 << g_conf()->mon_max_osd << ")";
11398 goto reply;
11399 }
11400
11401 // Don't allow shrinking OSD number as this will cause data loss
11402 // and may cause kernel crashes.
11403 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11404 if (newmax < osdmap.get_max_osd()) {
11405 // Check if the OSDs exist between current max and new value.
11406 // If there are any OSDs exist, then don't allow shrinking number
11407 // of OSDs.
11408 for (int i = newmax; i < osdmap.get_max_osd(); i++) {
11409 if (osdmap.exists(i)) {
11410 err = -EBUSY;
11411 ss << "cannot shrink max_osd to " << newmax
11412 << " because osd." << i << " (and possibly others) still in use";
11413 goto reply;
11414 }
11415 }
11416 }
11417
11418 pending_inc.new_max_osd = newmax;
11419 ss << "set new max_osd = " << pending_inc.new_max_osd;
11420 getline(ss, rs);
11421 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11422 get_last_committed() + 1));
11423 return true;
11424
11425 } else if (prefix == "osd set-full-ratio" ||
11426 prefix == "osd set-backfillfull-ratio" ||
11427 prefix == "osd set-nearfull-ratio") {
11428 double n;
11429 if (!cmd_getval(cmdmap, "ratio", n)) {
11430 ss << "unable to parse 'ratio' value '"
11431 << cmd_vartype_stringify(cmdmap.at("ratio")) << "'";
11432 err = -EINVAL;
11433 goto reply;
11434 }
11435 if (prefix == "osd set-full-ratio")
11436 pending_inc.new_full_ratio = n;
11437 else if (prefix == "osd set-backfillfull-ratio")
11438 pending_inc.new_backfillfull_ratio = n;
11439 else if (prefix == "osd set-nearfull-ratio")
11440 pending_inc.new_nearfull_ratio = n;
11441 ss << prefix << " " << n;
11442 getline(ss, rs);
11443 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11444 get_last_committed() + 1));
11445 return true;
11446 } else if (prefix == "osd set-require-min-compat-client") {
11447 string v;
11448 cmd_getval(cmdmap, "version", v);
11449 ceph_release_t vno = ceph_release_from_name(v);
11450 if (!vno) {
11451 ss << "version " << v << " is not recognized";
11452 err = -EINVAL;
11453 goto reply;
11454 }
11455 OSDMap newmap;
11456 newmap.deepish_copy_from(osdmap);
11457 newmap.apply_incremental(pending_inc);
11458 newmap.require_min_compat_client = vno;
11459 auto mvno = newmap.get_min_compat_client();
11460 if (vno < mvno) {
11461 ss << "osdmap current utilizes features that require " << mvno
11462 << "; cannot set require_min_compat_client below that to " << vno;
11463 err = -EPERM;
11464 goto reply;
11465 }
11466 bool sure = false;
11467 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11468 if (!sure) {
11469 FeatureMap m;
11470 mon.get_combined_feature_map(&m);
11471 uint64_t features = ceph_release_features(to_integer<int>(vno));
11472 bool first = true;
11473 bool ok = true;
11474 for (int type : {
11475 CEPH_ENTITY_TYPE_CLIENT,
11476 CEPH_ENTITY_TYPE_MDS,
11477 CEPH_ENTITY_TYPE_MGR }) {
11478 auto p = m.m.find(type);
11479 if (p == m.m.end()) {
11480 continue;
11481 }
11482 for (auto& q : p->second) {
11483 uint64_t missing = ~q.first & features;
11484 if (missing) {
11485 if (first) {
11486 ss << "cannot set require_min_compat_client to " << v << ": ";
11487 } else {
11488 ss << "; ";
11489 }
11490 first = false;
11491 ss << q.second << " connected " << ceph_entity_type_name(type)
11492 << "(s) look like " << ceph_release_name(
11493 ceph_release_from_features(q.first))
11494 << " (missing 0x" << std::hex << missing << std::dec << ")";
11495 ok = false;
11496 }
11497 }
11498 }
11499 if (!ok) {
11500 ss << "; add --yes-i-really-mean-it to do it anyway";
11501 err = -EPERM;
11502 goto reply;
11503 }
11504 }
11505 ss << "set require_min_compat_client to " << vno;
11506 pending_inc.new_require_min_compat_client = vno;
11507 getline(ss, rs);
11508 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
11509 get_last_committed() + 1));
11510 return true;
11511 } else if (prefix == "osd pause") {
11512 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11513
11514 } else if (prefix == "osd unpause") {
11515 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11516
11517 } else if (prefix == "osd set") {
11518 bool sure = false;
11519 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11520
11521 string key;
11522 cmd_getval(cmdmap, "key", key);
11523 if (key == "pause")
11524 return prepare_set_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11525 else if (key == "noup")
11526 return prepare_set_flag(op, CEPH_OSDMAP_NOUP);
11527 else if (key == "nodown")
11528 return prepare_set_flag(op, CEPH_OSDMAP_NODOWN);
11529 else if (key == "noout")
11530 return prepare_set_flag(op, CEPH_OSDMAP_NOOUT);
11531 else if (key == "noin")
11532 return prepare_set_flag(op, CEPH_OSDMAP_NOIN);
11533 else if (key == "nobackfill")
11534 return prepare_set_flag(op, CEPH_OSDMAP_NOBACKFILL);
11535 else if (key == "norebalance")
11536 return prepare_set_flag(op, CEPH_OSDMAP_NOREBALANCE);
11537 else if (key == "norecover")
11538 return prepare_set_flag(op, CEPH_OSDMAP_NORECOVER);
11539 else if (key == "noscrub")
11540 return prepare_set_flag(op, CEPH_OSDMAP_NOSCRUB);
11541 else if (key == "nodeep-scrub")
11542 return prepare_set_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11543 else if (key == "notieragent")
11544 return prepare_set_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11545 else if (key == "nosnaptrim")
11546 return prepare_set_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11547 else if (key == "pglog_hardlimit") {
11548 if (!osdmap.get_num_up_osds() && !sure) {
11549 ss << "Not advisable to continue since no OSDs are up. Pass "
11550 << "--yes-i-really-mean-it if you really wish to continue.";
11551 err = -EPERM;
11552 goto reply;
11553 }
11554 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11555 // we are reusing a jewel feature bit that was retired in luminous.
11556 if (osdmap.require_osd_release >= ceph_release_t::luminous &&
11557 (HAVE_FEATURE(osdmap.get_up_osd_features(), OSD_PGLOG_HARDLIMIT)
11558 || sure)) {
11559 return prepare_set_flag(op, CEPH_OSDMAP_PGLOG_HARDLIMIT);
11560 } else {
11561 ss << "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11562 err = -EPERM;
11563 goto reply;
11564 }
11565 } else {
11566 ss << "unrecognized flag '" << key << "'";
11567 err = -EINVAL;
11568 }
11569
11570 } else if (prefix == "osd unset") {
11571 string key;
11572 cmd_getval(cmdmap, "key", key);
11573 if (key == "pause")
11574 return prepare_unset_flag(op, CEPH_OSDMAP_PAUSERD | CEPH_OSDMAP_PAUSEWR);
11575 else if (key == "noup")
11576 return prepare_unset_flag(op, CEPH_OSDMAP_NOUP);
11577 else if (key == "nodown")
11578 return prepare_unset_flag(op, CEPH_OSDMAP_NODOWN);
11579 else if (key == "noout")
11580 return prepare_unset_flag(op, CEPH_OSDMAP_NOOUT);
11581 else if (key == "noin")
11582 return prepare_unset_flag(op, CEPH_OSDMAP_NOIN);
11583 else if (key == "nobackfill")
11584 return prepare_unset_flag(op, CEPH_OSDMAP_NOBACKFILL);
11585 else if (key == "norebalance")
11586 return prepare_unset_flag(op, CEPH_OSDMAP_NOREBALANCE);
11587 else if (key == "norecover")
11588 return prepare_unset_flag(op, CEPH_OSDMAP_NORECOVER);
11589 else if (key == "noscrub")
11590 return prepare_unset_flag(op, CEPH_OSDMAP_NOSCRUB);
11591 else if (key == "nodeep-scrub")
11592 return prepare_unset_flag(op, CEPH_OSDMAP_NODEEP_SCRUB);
11593 else if (key == "notieragent")
11594 return prepare_unset_flag(op, CEPH_OSDMAP_NOTIERAGENT);
11595 else if (key == "nosnaptrim")
11596 return prepare_unset_flag(op, CEPH_OSDMAP_NOSNAPTRIM);
11597 else {
11598 ss << "unrecognized flag '" << key << "'";
11599 err = -EINVAL;
11600 }
11601
11602 } else if (prefix == "osd require-osd-release") {
11603 string release;
11604 cmd_getval(cmdmap, "release", release);
11605 bool sure = false;
11606 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11607 ceph_release_t rel = ceph_release_from_name(release.c_str());
11608 if (!rel) {
11609 ss << "unrecognized release " << release;
11610 err = -EINVAL;
11611 goto reply;
11612 }
11613 if (rel == osdmap.require_osd_release) {
11614 // idempotent
11615 err = 0;
11616 goto reply;
11617 }
11618 ceph_assert(osdmap.require_osd_release >= ceph_release_t::octopus);
11619 if (!osdmap.get_num_up_osds() && !sure) {
11620 ss << "Not advisable to continue since no OSDs are up. Pass "
11621 << "--yes-i-really-mean-it if you really wish to continue.";
11622 err = -EPERM;
11623 goto reply;
11624 }
11625 if (rel == ceph_release_t::octopus) {
11626 if (!mon.monmap->get_required_features().contains_all(
11627 ceph::features::mon::FEATURE_OCTOPUS)) {
11628 ss << "not all mons are octopus";
11629 err = -EPERM;
11630 goto reply;
11631 }
11632 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_OCTOPUS))
11633 && !sure) {
11634 ss << "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11635 err = -EPERM;
11636 goto reply;
11637 }
11638 } else if (rel == ceph_release_t::pacific) {
11639 if (!mon.monmap->get_required_features().contains_all(
11640 ceph::features::mon::FEATURE_PACIFIC)) {
11641 ss << "not all mons are pacific";
11642 err = -EPERM;
11643 goto reply;
11644 }
11645 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_PACIFIC))
11646 && !sure) {
11647 ss << "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11648 err = -EPERM;
11649 goto reply;
11650 }
11651 } else if (rel == ceph_release_t::quincy) {
11652 if (!mon.monmap->get_required_features().contains_all(
11653 ceph::features::mon::FEATURE_QUINCY)) {
11654 ss << "not all mons are quincy";
11655 err = -EPERM;
11656 goto reply;
11657 }
11658 if ((!HAVE_FEATURE(osdmap.get_up_osd_features(), SERVER_QUINCY))
11659 && !sure) {
11660 ss << "not all up OSDs have CEPH_FEATURE_SERVER_QUINCY feature";
11661 err = -EPERM;
11662 goto reply;
11663 }
11664 } else {
11665 ss << "not supported for this release";
11666 err = -EPERM;
11667 goto reply;
11668 }
11669 if (rel < osdmap.require_osd_release) {
11670 ss << "require_osd_release cannot be lowered once it has been set";
11671 err = -EPERM;
11672 goto reply;
11673 }
11674 pending_inc.new_require_osd_release = rel;
11675 goto update;
11676 } else if (prefix == "osd down" ||
11677 prefix == "osd out" ||
11678 prefix == "osd in" ||
11679 prefix == "osd rm" ||
11680 prefix == "osd stop") {
11681
11682 bool any = false;
11683 bool stop = false;
11684 bool verbose = true;
11685 bool definitely_dead = false;
11686
11687 vector<string> idvec;
11688 cmd_getval(cmdmap, "ids", idvec);
11689 cmd_getval(cmdmap, "definitely_dead", definitely_dead);
11690 derr << "definitely_dead " << (int)definitely_dead << dendl;
11691 for (unsigned j = 0; j < idvec.size() && !stop; j++) {
11692 set<int> osds;
11693
11694 // wildcard?
11695 if (j == 0 &&
11696 (idvec[0] == "any" || idvec[0] == "all" || idvec[0] == "*")) {
11697 if (prefix == "osd in") {
11698 // touch out osds only
11699 osdmap.get_out_existing_osds(osds);
11700 } else {
11701 osdmap.get_all_osds(osds);
11702 }
11703 stop = true;
11704 verbose = false; // so the output is less noisy.
11705 } else {
11706 long osd = parse_osd_id(idvec[j].c_str(), &ss);
11707 if (osd < 0) {
11708 ss << "invalid osd id" << osd;
11709 err = -EINVAL;
11710 continue;
11711 } else if (!osdmap.exists(osd)) {
11712 ss << "osd." << osd << " does not exist. ";
11713 continue;
11714 }
11715
11716 osds.insert(osd);
11717 }
11718
11719 for (auto &osd : osds) {
11720 if (prefix == "osd down") {
11721 if (osdmap.is_down(osd)) {
11722 if (verbose)
11723 ss << "osd." << osd << " is already down. ";
11724 } else {
11725 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP);
11726 ss << "marked down osd." << osd << ". ";
11727 any = true;
11728 }
11729 if (definitely_dead) {
11730 if (!pending_inc.new_xinfo.count(osd)) {
11731 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11732 }
11733 if (pending_inc.new_xinfo[osd].dead_epoch < pending_inc.epoch) {
11734 any = true;
11735 }
11736 pending_inc.new_xinfo[osd].dead_epoch = pending_inc.epoch;
11737 }
11738 } else if (prefix == "osd out") {
11739 if (osdmap.is_out(osd)) {
11740 if (verbose)
11741 ss << "osd." << osd << " is already out. ";
11742 } else {
11743 pending_inc.new_weight[osd] = CEPH_OSD_OUT;
11744 if (osdmap.osd_weight[osd]) {
11745 if (pending_inc.new_xinfo.count(osd) == 0) {
11746 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11747 }
11748 pending_inc.new_xinfo[osd].old_weight = osdmap.osd_weight[osd];
11749 }
11750 ss << "marked out osd." << osd << ". ";
11751 std::ostringstream msg;
11752 msg << "Client " << op->get_session()->entity_name
11753 << " marked osd." << osd << " out";
11754 if (osdmap.is_up(osd)) {
11755 msg << ", while it was still marked up";
11756 } else {
11757 auto period = ceph_clock_now() - down_pending_out[osd];
11758 msg << ", after it was down for " << int(period.sec())
11759 << " seconds";
11760 }
11761
11762 mon.clog->info() << msg.str();
11763 any = true;
11764 }
11765 } else if (prefix == "osd in") {
11766 if (osdmap.is_in(osd)) {
11767 if (verbose)
11768 ss << "osd." << osd << " is already in. ";
11769 } else {
11770 if (osdmap.osd_xinfo[osd].old_weight > 0) {
11771 pending_inc.new_weight[osd] = osdmap.osd_xinfo[osd].old_weight;
11772 if (pending_inc.new_xinfo.count(osd) == 0) {
11773 pending_inc.new_xinfo[osd] = osdmap.osd_xinfo[osd];
11774 }
11775 pending_inc.new_xinfo[osd].old_weight = 0;
11776 } else {
11777 pending_inc.new_weight[osd] = CEPH_OSD_IN;
11778 }
11779 ss << "marked in osd." << osd << ". ";
11780 any = true;
11781 }
11782 } else if (prefix == "osd rm") {
11783 err = prepare_command_osd_remove(osd);
11784
11785 if (err == -EBUSY) {
11786 if (any)
11787 ss << ", ";
11788 ss << "osd." << osd << " is still up; must be down before removal. ";
11789 } else {
11790 ceph_assert(err == 0);
11791 if (any) {
11792 ss << ", osd." << osd;
11793 } else {
11794 ss << "removed osd." << osd;
11795 }
11796 any = true;
11797 }
11798 } else if (prefix == "osd stop") {
11799 if (osdmap.is_stop(osd)) {
11800 if (verbose)
11801 ss << "osd." << osd << " is already stopped. ";
11802 } else if (osdmap.is_down(osd)) {
11803 pending_inc.pending_osd_state_set(osd, CEPH_OSD_STOP);
11804 ss << "stop down osd." << osd << ". ";
11805 any = true;
11806 } else {
11807 pending_inc.pending_osd_state_set(osd, CEPH_OSD_UP | CEPH_OSD_STOP);
11808 ss << "stop osd." << osd << ". ";
11809 any = true;
11810 }
11811 }
11812 }
11813 }
11814 if (any) {
11815 getline(ss, rs);
11816 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11817 get_last_committed() + 1));
11818 return true;
11819 }
11820 } else if (prefix == "osd set-group" ||
11821 prefix == "osd unset-group" ||
11822 prefix == "osd add-noup" ||
11823 prefix == "osd add-nodown" ||
11824 prefix == "osd add-noin" ||
11825 prefix == "osd add-noout" ||
11826 prefix == "osd rm-noup" ||
11827 prefix == "osd rm-nodown" ||
11828 prefix == "osd rm-noin" ||
11829 prefix == "osd rm-noout") {
11830 bool do_set = prefix == "osd set-group" ||
11831 prefix.find("add") != string::npos;
11832 string flag_str;
11833 unsigned flags = 0;
11834 vector<string> who;
11835 if (prefix == "osd set-group" || prefix == "osd unset-group") {
11836 cmd_getval(cmdmap, "flags", flag_str);
11837 cmd_getval(cmdmap, "who", who);
11838 vector<string> raw_flags;
11839 boost::split(raw_flags, flag_str, boost::is_any_of(","));
11840 for (auto& f : raw_flags) {
11841 if (f == "noup")
11842 flags |= CEPH_OSD_NOUP;
11843 else if (f == "nodown")
11844 flags |= CEPH_OSD_NODOWN;
11845 else if (f == "noin")
11846 flags |= CEPH_OSD_NOIN;
11847 else if (f == "noout")
11848 flags |= CEPH_OSD_NOOUT;
11849 else {
11850 ss << "unrecognized flag '" << f << "', must be one of "
11851 << "{noup,nodown,noin,noout}";
11852 err = -EINVAL;
11853 goto reply;
11854 }
11855 }
11856 } else {
11857 cmd_getval(cmdmap, "ids", who);
11858 if (prefix.find("noup") != string::npos)
11859 flags = CEPH_OSD_NOUP;
11860 else if (prefix.find("nodown") != string::npos)
11861 flags = CEPH_OSD_NODOWN;
11862 else if (prefix.find("noin") != string::npos)
11863 flags = CEPH_OSD_NOIN;
11864 else if (prefix.find("noout") != string::npos)
11865 flags = CEPH_OSD_NOOUT;
11866 else
11867 ceph_assert(0 == "Unreachable!");
11868 }
11869 if (flags == 0) {
11870 ss << "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11871 err = -EINVAL;
11872 goto reply;
11873 }
11874 if (who.empty()) {
11875 ss << "must specify at least one or more targets to set/unset";
11876 err = -EINVAL;
11877 goto reply;
11878 }
11879 set<int> osds;
11880 set<int> crush_nodes;
11881 set<int> device_classes;
11882 for (auto& w : who) {
11883 if (w == "any" || w == "all" || w == "*") {
11884 osdmap.get_all_osds(osds);
11885 break;
11886 }
11887 std::stringstream ts;
11888 if (auto osd = parse_osd_id(w.c_str(), &ts); osd >= 0) {
11889 osds.insert(osd);
11890 } else if (osdmap.crush->name_exists(w)) {
11891 crush_nodes.insert(osdmap.crush->get_item_id(w));
11892 } else if (osdmap.crush->class_exists(w)) {
11893 device_classes.insert(osdmap.crush->get_class_id(w));
11894 } else {
11895 ss << "unable to parse osd id or crush node or device class: "
11896 << "\"" << w << "\". ";
11897 }
11898 }
11899 if (osds.empty() && crush_nodes.empty() && device_classes.empty()) {
11900 // ss has reason for failure
11901 err = -EINVAL;
11902 goto reply;
11903 }
11904 bool any = false;
11905 for (auto osd : osds) {
11906 if (!osdmap.exists(osd)) {
11907 ss << "osd." << osd << " does not exist. ";
11908 continue;
11909 }
11910 if (do_set) {
11911 if (flags & CEPH_OSD_NOUP) {
11912 any |= osdmap.is_noup_by_osd(osd) ?
11913 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP) :
11914 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP);
11915 }
11916 if (flags & CEPH_OSD_NODOWN) {
11917 any |= osdmap.is_nodown_by_osd(osd) ?
11918 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN) :
11919 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN);
11920 }
11921 if (flags & CEPH_OSD_NOIN) {
11922 any |= osdmap.is_noin_by_osd(osd) ?
11923 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN) :
11924 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN);
11925 }
11926 if (flags & CEPH_OSD_NOOUT) {
11927 any |= osdmap.is_noout_by_osd(osd) ?
11928 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT) :
11929 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT);
11930 }
11931 } else {
11932 if (flags & CEPH_OSD_NOUP) {
11933 any |= osdmap.is_noup_by_osd(osd) ?
11934 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOUP) :
11935 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOUP);
11936 }
11937 if (flags & CEPH_OSD_NODOWN) {
11938 any |= osdmap.is_nodown_by_osd(osd) ?
11939 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NODOWN) :
11940 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NODOWN);
11941 }
11942 if (flags & CEPH_OSD_NOIN) {
11943 any |= osdmap.is_noin_by_osd(osd) ?
11944 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOIN) :
11945 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOIN);
11946 }
11947 if (flags & CEPH_OSD_NOOUT) {
11948 any |= osdmap.is_noout_by_osd(osd) ?
11949 pending_inc.pending_osd_state_set(osd, CEPH_OSD_NOOUT) :
11950 pending_inc.pending_osd_state_clear(osd, CEPH_OSD_NOOUT);
11951 }
11952 }
11953 }
11954 for (auto& id : crush_nodes) {
11955 auto old_flags = osdmap.get_crush_node_flags(id);
11956 auto& pending_flags = pending_inc.new_crush_node_flags[id];
11957 pending_flags |= old_flags; // adopt existing flags first!
11958 if (do_set) {
11959 pending_flags |= flags;
11960 } else {
11961 pending_flags &= ~flags;
11962 }
11963 any = true;
11964 }
11965 for (auto& id : device_classes) {
11966 auto old_flags = osdmap.get_device_class_flags(id);
11967 auto& pending_flags = pending_inc.new_device_class_flags[id];
11968 pending_flags |= old_flags;
11969 if (do_set) {
11970 pending_flags |= flags;
11971 } else {
11972 pending_flags &= ~flags;
11973 }
11974 any = true;
11975 }
11976 if (any) {
11977 getline(ss, rs);
11978 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, err, rs,
11979 get_last_committed() + 1));
11980 return true;
11981 }
  } else if (prefix == "osd pg-temp") {
    // Manually set (or, with an empty id list, clear) the pg_temp
    // acting-set override for a single PG.
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // an update for this pg is already queued in the pending incremental;
    // retry the command after that proposal commits
    if (pending_inc.new_pg_temp.count(pgid)) {
      dout(10) << __func__ << " waiting for pending update on " << pgid << dendl;
      wait_for_finished_proposal(op, new C_RetryMessage(this, op));
      return true;
    }

    vector<int64_t> id_vec;
    vector<int32_t> new_pg_temp;
    cmd_getval(cmdmap, "id", id_vec);
    if (id_vec.empty())  {
      // empty list == remove the pg_temp entry
      pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>();
      ss << "done cleaning up pg_temp of " << pgid;
      goto update;
    }
    for (auto osd : id_vec) {
      if (!osdmap.exists(osd)) {
        ss << "osd." << osd << " does not exist";
        err = -ENOENT;
        goto reply;
      }
      new_pg_temp.push_back(osd);
    }

    // the override must still satisfy the pool's min_size..size bounds
    int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
    if ((int)new_pg_temp.size() < pool_min_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") < pool min size ("
         << pool_min_size << ")";
      err = -EINVAL;
      goto reply;
    }

    int pool_size = osdmap.get_pg_pool_size(pgid);
    if ((int)new_pg_temp.size() > pool_size) {
      ss << "num of osds (" << new_pg_temp.size() <<") > pool size ("
         << pool_size << ")";
      err = -EINVAL;
      goto reply;
    }

    pending_inc.new_pg_temp[pgid] = mempool::osdmap::vector<int>(
      new_pg_temp.begin(), new_pg_temp.end());
    ss << "set " << pgid << " pg_temp mapping to " << new_pg_temp;
    goto update;
  } else if (prefix == "osd primary-temp") {
    // Manually set the primary_temp override for a PG (-1 clears it by
    // mapping to "no override").
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    int64_t osd;
    if (!cmd_getval(cmdmap, "id", osd)) {
      ss << "unable to parse 'id' value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    // -1 is the "clear override" sentinel and is allowed to not exist
    if (osd != -1 && !osdmap.exists(osd)) {
      ss << "osd." << osd << " does not exist";
      err = -ENOENT;
      goto reply;
    }

    // primary_temp requires clients that understand it (firefly+)
    if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
        osdmap.require_min_compat_client < ceph_release_t::firefly) {
      ss << "require_min_compat_client "
         << osdmap.require_min_compat_client
         << " < firefly, which is required for primary-temp";
      err = -EPERM;
      goto reply;
    }

    pending_inc.new_primary_temp[pgid] = osd;
    ss << "set " << pgid << " primary_temp mapping to " << osd;
    goto update;
  } else if (prefix == "pg repeer") {
    // Force a PG to re-peer by publishing a perturbed pg_temp mapping;
    // the OSDs will then converge back to the crush-derived mapping.
    pg_t pgid;
    string pgidstr;
    cmd_getval(cmdmap, "pgid", pgidstr);
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg '" << pgidstr << "' does not exist";
      err = -ENOENT;
      goto reply;
    }
    vector<int> acting;
    int primary;
    osdmap.pg_to_acting_osds(pgid, &acting, &primary);
    if (primary < 0) {
      err = -EAGAIN;
      ss << "pg currently has no primary";
      goto reply;
    }
    if (acting.size() > 1) {
      // map to just primary; it will map back to what it wants
      pending_inc.new_pg_temp[pgid] = { primary };
    } else {
      // hmm, pick another arbitrary osd to induce a change.  Note
      // that this won't work if there is only one suitable OSD in the cluster.
      int i;
      bool done = false;
      for (i = 0; i < osdmap.get_max_osd(); ++i) {
        if (i == primary || !osdmap.is_up(i) || !osdmap.exists(i)) {
          continue;
        }
        pending_inc.new_pg_temp[pgid] = { primary, i };
        done = true;
        break;
      }
      if (!done) {
        err = -EAGAIN;
        ss << "not enough up OSDs in the cluster to force repeer";
        goto reply;
      }
    }
    goto update;
  } else if (prefix == "osd pg-upmap" ||
             prefix == "osd rm-pg-upmap" ||
             prefix == "osd pg-upmap-items" ||
             prefix == "osd rm-pg-upmap-items") {
    // Common validation for all four pg-upmap variants: client
    // compatibility, cluster feature support, and pgid sanity.
    if (osdmap.require_min_compat_client < ceph_release_t::luminous) {
      ss << "min_compat_client "
         << osdmap.require_min_compat_client
         << " < luminous, which is required for pg-upmap. "
         << "Try 'ceph osd set-require-min-compat-client luminous' "
         << "before using the new interface";
      err = -EPERM;
      goto reply;
    }
    err = check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP, ss);
    if (err == -EAGAIN)
      goto wait;
    if (err < 0)
      goto reply;
    string pgidstr;
    if (!cmd_getval(cmdmap, "pgid", pgidstr)) {
      ss << "unable to parse 'pgid' value '"
         << cmd_vartype_stringify(cmdmap.at("pgid")) << "'";
      err = -EINVAL;
      goto reply;
    }
    pg_t pgid;
    if (!pgid.parse(pgidstr.c_str())) {
      ss << "invalid pgid '" << pgidstr << "'";
      err = -EINVAL;
      goto reply;
    }
    if (!osdmap.pg_exists(pgid)) {
      ss << "pg " << pgid << " does not exist";
      err = -ENOENT;
      goto reply;
    }
    // the pool is queued for deletion in the pending incremental: reply
    // -ENOENT once that proposal is committed
    if (pending_inc.old_pools.count(pgid.pool())) {
      ss << "pool of " << pgid << " is pending removal";
      err = -ENOENT;
      getline(ss, rs);
      wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, err, rs, get_last_committed() + 1));
      return true;
    }

    // map the command prefix to an internal op code
    enum {
      OP_PG_UPMAP,
      OP_RM_PG_UPMAP,
      OP_PG_UPMAP_ITEMS,
      OP_RM_PG_UPMAP_ITEMS,
    } option;

    if (prefix == "osd pg-upmap") {
      option = OP_PG_UPMAP;
    } else if (prefix == "osd rm-pg-upmap") {
      option = OP_RM_PG_UPMAP;
    } else if (prefix == "osd pg-upmap-items") {
      option = OP_PG_UPMAP_ITEMS;
    } else {
      option = OP_RM_PG_UPMAP_ITEMS;
    }

    // check pending upmap changes: if this pg already has a queued
    // upmap (or upmap-items) change, retry after it commits rather
    // than stacking conflicting updates into one incremental
    switch (option) {
    case OP_PG_UPMAP: // fall through
    case OP_RM_PG_UPMAP:
      if (pending_inc.new_pg_upmap.count(pgid) ||
          pending_inc.old_pg_upmap.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    case OP_PG_UPMAP_ITEMS: // fall through
    case OP_RM_PG_UPMAP_ITEMS:
      if (pending_inc.new_pg_upmap_items.count(pgid) ||
          pending_inc.old_pg_upmap_items.count(pgid)) {
        dout(10) << __func__ << " waiting for pending update on "
                 << pgid << dendl;
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      break;

    default:
      ceph_abort_msg("invalid option");
    }

    // execute the selected upmap operation
    switch (option) {
    case OP_PG_UPMAP:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // the explicit mapping must respect the pool's min_size..size bounds
        int pool_min_size = osdmap.get_pg_pool_min_size(pgid);
        if ((int)id_vec.size() < pool_min_size) {
          ss << "num of osds (" << id_vec.size() <<") < pool min size ("
             << pool_min_size << ")";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)id_vec.size() > pool_size) {
          ss << "num of osds (" << id_vec.size() <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<int32_t> new_pg_upmap;
        for (auto osd : id_vec) {
          // CRUSH_ITEM_NONE ("hole") is allowed; everything else must exist
          if (osd != CRUSH_ITEM_NONE && !osdmap.exists(osd)) {
            ss << "osd." << osd << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          // silently drop duplicate osd ids (with a note in the reply)
          auto it = std::find(new_pg_upmap.begin(), new_pg_upmap.end(), osd);
          if (it != new_pg_upmap.end()) {
            ss << "osd." << osd << " already exists, ";
            continue;
          }
          new_pg_upmap.push_back(osd);
        }

        if (new_pg_upmap.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap[pgid] = mempool::osdmap::vector<int32_t>(
          new_pg_upmap.begin(), new_pg_upmap.end());
        ss << "set " << pgid << " pg_upmap mapping to " << new_pg_upmap;
      }
      break;

    case OP_RM_PG_UPMAP:
      {
        pending_inc.old_pg_upmap.insert(pgid);
        ss << "clear " << pgid << " pg_upmap mapping";
      }
      break;

    case OP_PG_UPMAP_ITEMS:
      {
        vector<int64_t> id_vec;
        if (!cmd_getval(cmdmap, "id", id_vec)) {
          ss << "unable to parse 'id' value(s) '"
             << cmd_vartype_stringify(cmdmap.at("id")) << "'";
          err = -EINVAL;
          goto reply;
        }

        // ids come flattened as from,to pairs; an odd count is malformed
        if (id_vec.size() % 2) {
          ss << "you must specify pairs of osd ids to be remapped";
          err = -EINVAL;
          goto reply;
        }

        int pool_size = osdmap.get_pg_pool_size(pgid);
        if ((int)(id_vec.size() / 2) > pool_size) {
          ss << "num of osd pairs (" << id_vec.size() / 2 <<") > pool size ("
             << pool_size << ")";
          err = -EINVAL;
          goto reply;
        }

        vector<pair<int32_t,int32_t>> new_pg_upmap_items;
        ostringstream items;
        items << "[";
        // p advances twice per iteration: once here (from), once by ++p (to);
        // safe because size was verified to be even above
        for (auto p = id_vec.begin(); p != id_vec.end(); ++p) {
          int from = *p++;
          int to = *p;
          if (from == to) {
            ss << "from osd." << from << " == to osd." << to << ", ";
            continue;
          }
          if (!osdmap.exists(from)) {
            ss << "osd." << from << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          if (to != CRUSH_ITEM_NONE && !osdmap.exists(to)) {
            ss << "osd." << to << " does not exist";
            err = -ENOENT;
            goto reply;
          }
          pair<int32_t,int32_t> entry = make_pair(from, to);
          auto it = std::find(new_pg_upmap_items.begin(),
            new_pg_upmap_items.end(), entry);
          if (it != new_pg_upmap_items.end()) {
            ss << "osd." << from << " -> osd." << to << " already exists, ";
            continue;
          }
          new_pg_upmap_items.push_back(entry);
          items << from << "->" << to << ",";
        }
        string out(items.str());
        // drop last ',' -- if every pair was skipped this leaves "",
        // but then the empty check below replies before 'out' is used
        out.resize(out.size() - 1);
        out += "]";

        if (new_pg_upmap_items.empty()) {
          ss << "no valid upmap items(pairs) is specified";
          err = -EINVAL;
          goto reply;
        }

        pending_inc.new_pg_upmap_items[pgid] =
          mempool::osdmap::vector<pair<int32_t,int32_t>>(
          new_pg_upmap_items.begin(), new_pg_upmap_items.end());
        ss << "set " << pgid << " pg_upmap_items mapping to " << out;
      }
      break;

    case OP_RM_PG_UPMAP_ITEMS:
      {
        pending_inc.old_pg_upmap_items.insert(pgid);
        ss << "clear " << pgid << " pg_upmap_items mapping";
      }
      break;

    default:
      ceph_abort_msg("invalid option");
    }

    goto update;
12368 } else if (prefix == "osd primary-affinity") {
12369 int64_t id;
12370 if (!cmd_getval(cmdmap, "id", id)) {
12371 ss << "invalid osd id value '"
12372 << cmd_vartype_stringify(cmdmap.at("id")) << "'";
12373 err = -EINVAL;
12374 goto reply;
12375 }
12376 double w;
12377 if (!cmd_getval(cmdmap, "weight", w)) {
12378 ss << "unable to parse 'weight' value '"
12379 << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
12380 err = -EINVAL;
12381 goto reply;
12382 }
12383 long ww = (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY*w);
12384 if (ww < 0L) {
12385 ss << "weight must be >= 0";
12386 err = -EINVAL;
12387 goto reply;
12388 }
12389 if (osdmap.require_min_compat_client != ceph_release_t::unknown &&
12390 osdmap.require_min_compat_client < ceph_release_t::firefly) {
12391 ss << "require_min_compat_client "
12392 << osdmap.require_min_compat_client
12393 << " < firefly, which is required for primary-affinity";
12394 err = -EPERM;
12395 goto reply;
12396 }
12397 if (osdmap.exists(id)) {
12398 pending_inc.new_primary_affinity[id] = ww;
12399 ss << "set osd." << id << " primary-affinity to " << w << " (" << std::ios::hex << ww << std::ios::dec << ")";
12400 getline(ss, rs);
12401 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12402 get_last_committed() + 1));
12403 return true;
12404 } else {
12405 ss << "osd." << id << " does not exist";
12406 err = -ENOENT;
12407 goto reply;
12408 }
  } else if (prefix == "osd reweight") {
    // Set the (override) reweight of one osd; w in [0,1] is scaled to the
    // fixed-point CEPH_OSD_IN range.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    double w;
    if (!cmd_getval(cmdmap, "weight", w)) {
      ss << "unable to parse weight value '"
         << cmd_vartype_stringify(cmdmap.at("weight")) << "'";
      err = -EINVAL;
      goto reply;
    }
    long ww = (int)((double)CEPH_OSD_IN*w);
    if (ww < 0L) {
      ss << "weight must be >= 0";
      err = -EINVAL;
      goto reply;
    }
    if (osdmap.exists(id)) {
      pending_inc.new_weight[id] = ww;
      // report both the requested weight and the raw fixed-point value
      ss << "reweighted osd." << id << " to " << w << " (" << std::hex << ww << std::dec << ")";
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    } else {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    }
  } else if (prefix == "osd reweightn") {
    // Batch form of "osd reweight": parse a {id: weight} map and queue
    // all new weights at once.
    map<int32_t, uint32_t> weights;
    err = parse_reweights(cct, cmdmap, osdmap, &weights);
    if (err) {
      ss << "unable to parse 'weights' value '"
         << cmd_vartype_stringify(cmdmap.at("weights")) << "'";
      goto reply;
    }
    pending_inc.new_weight.insert(weights.begin(), weights.end());
    // NOTE(review): unlike the other branches, rs is passed without a
    // getline(ss, rs), so the reply string is whatever rs already held
    // (presumably empty) -- confirm this is intended
    wait_for_finished_proposal(
      op,
      new Monitor::C_Command(mon, op, 0, rs, rdata, get_last_committed() + 1));
    return true;
  } else if (prefix == "osd lost") {
    // Declare a down osd permanently lost as of the epoch it went down;
    // requires explicit --yes-i-really-mean-it confirmation.
    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      ss << "unable to parse osd id value '"
         << cmd_vartype_stringify(cmdmap.at("id")) << "'";
      err = -EINVAL;
      goto reply;
    }
    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "are you SURE?  this might mean real, permanent data loss.  pass "
            "--yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = -ENOENT;
      goto reply;
    } else if (!osdmap.is_down(id)) {
      // only a down osd can be marked lost
      ss << "osd." << id << " is not down";
      err = -EBUSY;
      goto reply;
    } else {
      // record the loss at the epoch the osd was last marked down
      epoch_t e = osdmap.get_info(id).down_at;
      pending_inc.new_lost[id] = e;
      ss << "marked osd lost in epoch " << e;
      getline(ss, rs);
      wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                                get_last_committed() + 1));
      return true;
    }

  } else if (prefix == "osd destroy-actual" ||
             prefix == "osd purge-actual" ||
             prefix == "osd purge-new") {
    /* Destroying an OSD means that we don't expect to further make use of
     * the OSDs data (which may even become unreadable after this operation),
     * and that we are okay with scrubbing all its cephx keys and config-key
     * data (which may include lockbox keys, thus rendering the osd's data
     * unreadable).
     *
     * The OSD will not be removed. Instead, we will mark it as destroyed,
     * such that a subsequent call to `create` will not reuse the osd id.
     * This will play into being able to recreate the OSD, at the same
     * crush location, with minimal data movement.
     */

    // make sure authmon is writeable.
    if (!mon.authmon()->is_writeable()) {
      dout(10) << __func__ << " waiting for auth mon to be writeable for "
               << "osd destroy" << dendl;
      mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
      return false;
    }

    int64_t id;
    if (!cmd_getval(cmdmap, "id", id)) {
      auto p = cmdmap.find("id");
      if (p == cmdmap.end()) {
        ss << "no osd id specified";
      } else {
        ss << "unable to parse osd id value '"
           << cmd_vartype_stringify(cmdmap.at("id")) << "";
      }
      err = -EINVAL;
      goto reply;
    }

    bool is_destroy = (prefix == "osd destroy-actual");
    if (!is_destroy) {
      // only three prefixes reach this branch; anything else is a bug
      ceph_assert("osd purge-actual" == prefix ||
                  "osd purge-new" == prefix);
    }

    bool sure = false;
    cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
    if (!sure) {
      ss << "Are you SURE?  Did you verify with 'ceph osd safe-to-destroy'?  "
         << "This will mean real, permanent data loss, as well "
         << "as deletion of cephx and lockbox keys. "
         << "Pass --yes-i-really-mean-it if you really do.";
      err = -EPERM;
      goto reply;
    } else if (!osdmap.exists(id)) {
      ss << "osd." << id << " does not exist";
      err = 0; // idempotent
      goto reply;
    } else if (osdmap.is_up(id)) {
      ss << "osd." << id << " is not `down`.";
      err = -EBUSY;
      goto reply;
    } else if (is_destroy && osdmap.is_destroyed(id)) {
      ss << "destroyed osd." << id;
      err = 0;
      goto reply;
    }

    // purge-new may only remove osds that are still in the NEW state
    if (prefix == "osd purge-new" &&
        (osdmap.get_state(id) & CEPH_OSD_NEW) == 0) {
      ss << "osd." << id << " is not new";
      err = -EPERM;
      goto reply;
    }

    bool goto_reply = false;

    // plug paxos so the osd/auth/kv updates below land in one proposal --
    // presumably to keep the multi-service change atomic; confirm against
    // prepare_command_osd_destroy/purge
    paxos.plug();
    if (is_destroy) {
      err = prepare_command_osd_destroy(id, ss);
      // we checked above that it should exist.
      ceph_assert(err != -ENOENT);
    } else {
      err = prepare_command_osd_purge(id, ss);
      if (err == -ENOENT) {
        err = 0;
        ss << "osd." << id << " does not exist.";
        goto_reply = true;
      }
    }
    paxos.unplug();

    if (err < 0 || goto_reply) {
      goto reply;
    }

    if (is_destroy) {
      ss << "destroyed osd." << id;
    } else {
      ss << "purged osd." << id;
    }

    getline(ss, rs);
    wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, 0, rs, get_last_committed() + 1));
    force_immediate_propose();
    return true;

  } else if (prefix == "osd new") {
    // Create (or reuse) an osd id, with optional cephx/lockbox secrets
    // supplied as a JSON payload in the message data.

    // make sure authmon is writeable.
    if (!mon.authmon()->is_writeable()) {
      dout(10) << __func__ << " waiting for auth mon to be writeable for "
               << "osd new" << dendl;
      mon.authmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
      return false;
    }

    // make sure kvmon is writeable.
    if (!mon.kvmon()->is_writeable()) {
      dout(10) << __func__ << " waiting for kv mon to be writeable for "
               << "osd new" << dendl;
      mon.kvmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
      return false;
    }

    map<string,string> param_map;

    bufferlist bl = m->get_data();
    string param_json = bl.to_str();
    dout(20) << __func__ << " osd new json = " << param_json << dendl;

    err = get_json_str_map(param_json, ss, &param_map);
    if (err < 0)
      goto reply;

    dout(20) << __func__ << " osd new params " << param_map << dendl;

    // plug paxos so the osd/auth/kv updates are proposed together
    paxos.plug();
    err = prepare_command_osd_new(op, cmdmap, param_map, ss, f.get());
    paxos.unplug();

    if (err < 0) {
      goto reply;
    }

    if (f) {
      f->flush(rdata);
    } else {
      rdata.append(ss);
    }

    // positive EEXIST signals "already done" -- reply success without
    // proposing anything
    if (err == EEXIST) {
      // idempotent operation
      err = 0;
      goto reply;
    }

    wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, 0, rs, rdata,
                               get_last_committed() + 1));
    force_immediate_propose();
    return true;

  } else if (prefix == "osd create") {
    // Legacy osd id allocation.  An explicit id is honored only when a
    // uuid is also supplied (to keep the operation idempotent).

    // optional id provided?
    int64_t id = -1, cmd_id = -1;
    if (cmd_getval(cmdmap, "id", cmd_id)) {
      if (cmd_id < 0) {
        ss << "invalid osd id value '" << cmd_id << "'";
        err = -EINVAL;
        goto reply;
      }
      dout(10) << " osd create got id " << cmd_id << dendl;
    }

    uuid_d uuid;
    string uuidstr;
    if (cmd_getval(cmdmap, "uuid", uuidstr)) {
      if (!uuid.parse(uuidstr.c_str())) {
        ss << "invalid uuid value '" << uuidstr << "'";
        err = -EINVAL;
        goto reply;
      }
      // we only care about the id if we also have the uuid, to
      // ensure the operation's idempotency.
      id = cmd_id;
    }

    int32_t new_id = -1;
    err = prepare_command_osd_create(id, uuid, &new_id, ss);
    if (err < 0) {
      if (err == -EAGAIN) {
        wait_for_finished_proposal(op, new C_RetryMessage(this, op));
        return true;
      }
      // a check has failed; reply to the user.
      goto reply;

    } else if (err == EEXIST) {
      // this is an idempotent operation; we can go ahead and reply.
      if (f) {
        f->open_object_section("created_osd");
        f->dump_int("osdid", new_id);
        f->close_section();
        f->flush(rdata);
      } else {
        ss << new_id;
        rdata.append(ss);
      }
      err = 0;
      goto reply;
    }

    string empty_device_class;
    do_osd_create(id, uuid, empty_device_class, &new_id);

    if (f) {
      f->open_object_section("created_osd");
      f->dump_int("osdid", new_id);
      f->close_section();
      f->flush(rdata);
    } else {
      ss << new_id;
      rdata.append(ss);
    }
    wait_for_finished_proposal(op,
        new Monitor::C_Command(mon, op, 0, rs, rdata,
                               get_last_committed() + 1));
    return true;

  } else if (prefix == "osd blocklist clear" ||
             prefix == "osd blacklist clear") {
    // Remove every committed blocklist entry (plain and range) and drop
    // any not-yet-committed additions.
    // NOTE(review): only new_blocklist is cleared here, not any pending
    // new_range_blocklist additions -- confirm whether that is intended.
    pending_inc.new_blocklist.clear();
    std::list<std::pair<entity_addr_t,utime_t > > blocklist;
    std::list<std::pair<entity_addr_t,utime_t > > range_b;
    osdmap.get_blocklist(&blocklist, &range_b);
    for (const auto &entry : blocklist) {
      pending_inc.old_blocklist.push_back(entry.first);
    }
    for (const auto &entry : range_b) {
      pending_inc.old_range_blocklist.push_back(entry.first);
    }
    ss << " removed all blocklist entries";
    getline(ss, rs);
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
                                              get_last_committed() + 1));
    return true;
      // "osd blocklist [range] add|rm <addr> [<expire>]": add or remove a
      // single address (or, with "range", a CIDR range) from the blocklist.
12734 } else if (prefix == "osd blocklist" ||
12735 prefix == "osd blacklist") {
12736 string addrstr, rangestr;
12737 bool range = false;
12738 cmd_getval(cmdmap, "addr", addrstr);
12739 if (cmd_getval(cmdmap, "range", rangestr)) {
12740 if (rangestr == "range") {
12741 range = true;
12742 } else {
12743 ss << "Did you mean to specify \"osd blocklist range\"?";
12744 err = -EINVAL;
12745 goto reply;
12746 }
12747 }
12748 entity_addr_t addr;
12749 if (!addr.parse(addrstr)) {
12750 ss << "unable to parse address " << addrstr;
12751 err = -EINVAL;
12752 goto reply;
12753 }
12754 else {
12755 if (range) {
      // Range entries must parse as CIDR, the whole cluster must support
      // range blocklists, and the prefix length (stored in the nonce) must
      // fit the address family.
12756 if (!addr.maybe_cidr()) {
12757 ss << "You specified a range command, but " << addr
12758 << " does not parse as a CIDR range";
12759 err = -EINVAL;
12760 goto reply;
12761 }
12762 addr.type = entity_addr_t::TYPE_CIDR;
12763 err = check_cluster_features(CEPH_FEATUREMASK_RANGE_BLOCKLIST, ss);
12764 if (err) {
12765 goto reply;
12766 }
12767 if ((addr.is_ipv4() && addr.get_nonce() > 32) ||
12768 (addr.is_ipv6() && addr.get_nonce() > 128)) {
12769 ss << "Too many bits in range for that protocol!";
12770 err = -EINVAL;
12771 goto reply;
12772 }
12773 } else {
12774 if (osdmap.require_osd_release >= ceph_release_t::nautilus) {
12775 // always blocklist type ANY
12776 addr.set_type(entity_addr_t::TYPE_ANY);
12777 } else {
12778 addr.set_type(entity_addr_t::TYPE_LEGACY);
12779 }
12780 }
12781
      // "blacklistop" is accepted as a legacy alias for "blocklistop".
12782 string blocklistop;
12783 if (!cmd_getval(cmdmap, "blocklistop", blocklistop)) {
12784 cmd_getval(cmdmap, "blacklistop", blocklistop);
12785 }
12786 if (blocklistop == "add") {
12787 utime_t expires = ceph_clock_now();
12788 // default one hour
12789 double d = cmd_getval_or<double>(cmdmap, "expire",
12790 g_conf()->mon_osd_blocklist_default_expire);
12791 expires += d;
12792
      // Record the addition in the pending increment; if an un-blocklist
      // of the same address was queued earlier in this epoch, drop it so
      // the add wins.
12793 auto add_to_pending_blocklists = [](auto& nb, auto& ob,
12794 const auto& addr,
12795 const auto& expires) {
12796 nb[addr] = expires;
12797 // cancel any pending un-blocklisting request too
12798 auto it = std::find(ob.begin(),
12799 ob.end(), addr);
12800 if (it != ob.end()) {
12801 ob.erase(it);
12802 }
12803 };
12804 if (range) {
12805 add_to_pending_blocklists(pending_inc.new_range_blocklist,
12806 pending_inc.old_range_blocklist,
12807 addr, expires);
12808
12809 } else {
12810 add_to_pending_blocklists(pending_inc.new_blocklist,
12811 pending_inc.old_blocklist,
12812 addr, expires);
12813 }
12814
12815 ss << "blocklisting " << addr << " until " << expires << " (" << d << " sec)";
12816 getline(ss, rs);
12817 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12818 get_last_committed() + 1));
12819 return true;
12820 } else if (blocklistop == "rm") {
      // Remove: either the entry is committed (queue its removal) or it is
      // still only pending in this increment (erase the pending add).
12821 auto rm_from_pending_blocklists = [](const auto& addr,
12822 auto& blocklist,
12823 auto& ob, auto& pb) {
12824 if (blocklist.count(addr)) {
12825 ob.push_back(addr);
12826 return true;
12827 } else if (pb.count(addr)) {
12828 pb.erase(addr);
12829 return true;
12830 }
12831 return false;
12832 };
12833 if ((!range && rm_from_pending_blocklists(addr, osdmap.blocklist,
12834 pending_inc.old_blocklist,
12835 pending_inc.new_blocklist)) ||
12836 (range && rm_from_pending_blocklists(addr, osdmap.range_blocklist,
12837 pending_inc.old_range_blocklist,
12838 pending_inc.new_range_blocklist))) {
12839 ss << "un-blocklisting " << addr;
12840 getline(ss, rs);
12841 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12842 get_last_committed() + 1));
12843 return true;
12844 }
      // not found anywhere: success (err = 0), nothing to do
12845 ss << addr << " isn't blocklisted";
12846 err = 0;
12847 goto reply;
12848 }
12849 }
      // "osd pool mksnap <pool> <snap>": create a managed (pool-level)
      // snapshot. Rejected for pools already using unmanaged (self-managed)
      // snaps and for cache tiers; re-creating an existing snap is a no-op.
12850 } else if (prefix == "osd pool mksnap") {
12851 string poolstr;
12852 cmd_getval(cmdmap, "pool", poolstr);
12853 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12854 if (pool < 0) {
12855 ss << "unrecognized pool '" << poolstr << "'";
12856 err = -ENOENT;
12857 goto reply;
12858 }
12859 string snapname;
12860 cmd_getval(cmdmap, "snap", snapname);
12861 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12862 if (p->is_unmanaged_snaps_mode()) {
12863 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12864 err = -EINVAL;
12865 goto reply;
12866 } else if (p->snap_exists(snapname.c_str())) {
      // idempotent: snap already present in the committed map
12867 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12868 err = 0;
12869 goto reply;
12870 } else if (p->is_tier()) {
12871 ss << "pool " << poolstr << " is a cache tier";
12872 err = -EINVAL;
12873 goto reply;
12874 }
      // copy-on-write: work on the pending copy of the pool, seeding it
      // from the committed pool if this is its first change this epoch
12875 pg_pool_t *pp = 0;
12876 if (pending_inc.new_pools.count(pool))
12877 pp = &pending_inc.new_pools[pool];
12878 if (!pp) {
12879 pp = &pending_inc.new_pools[pool];
12880 *pp = *p;
12881 }
      // re-check against the pending copy in case the snap was added
      // earlier in this same increment
12882 if (pp->snap_exists(snapname.c_str())) {
12883 ss << "pool " << poolstr << " snap " << snapname << " already exists";
12884 } else {
12885 pp->add_snap(snapname.c_str(), ceph_clock_now());
12886 pp->set_snap_epoch(pending_inc.epoch);
12887 ss << "created pool " << poolstr << " snap " << snapname;
12888 }
12889 getline(ss, rs);
12890 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12891 get_last_committed() + 1));
12892 return true;
      // "osd pool rmsnap <pool> <snap>": remove a managed snapshot.
      // Removing a snap that does not exist is treated as success.
12893 } else if (prefix == "osd pool rmsnap") {
12894 string poolstr;
12895 cmd_getval(cmdmap, "pool", poolstr);
12896 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
12897 if (pool < 0) {
12898 ss << "unrecognized pool '" << poolstr << "'";
12899 err = -ENOENT;
12900 goto reply;
12901 }
12902 string snapname;
12903 cmd_getval(cmdmap, "snap", snapname);
12904 const pg_pool_t *p = osdmap.get_pg_pool(pool);
12905 if (p->is_unmanaged_snaps_mode()) {
12906 ss << "pool " << poolstr << " is in unmanaged snaps mode";
12907 err = -EINVAL;
12908 goto reply;
12909 } else if (!p->snap_exists(snapname.c_str())) {
      // idempotent: already gone from the committed map
12910 ss << "pool " << poolstr << " snap " << snapname << " does not exist";
12911 err = 0;
12912 goto reply;
12913 }
      // copy-on-write pending copy of the pool (see mksnap above)
12914 pg_pool_t *pp = 0;
12915 if (pending_inc.new_pools.count(pool))
12916 pp = &pending_inc.new_pools[pool];
12917 if (!pp) {
12918 pp = &pending_inc.new_pools[pool];
12919 *pp = *p;
12920 }
      // snap_exists() returns the snapid (0 if absent) on the pending copy
12921 snapid_t sn = pp->snap_exists(snapname.c_str());
12922 if (sn) {
12923 pp->remove_snap(sn);
12924 pp->set_snap_epoch(pending_inc.epoch);
12925 ss << "removed pool " << poolstr << " snap " << snapname;
12926 } else {
12927 ss << "already removed pool " << poolstr << " snap " << snapname;
12928 }
12929 getline(ss, rs);
12930 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
12931 get_last_committed() + 1));
12932 return true;
      // "osd pool create": create a replicated or erasure-coded pool.
      // Validates pg counts, pool type, crush rule, EC profile, and
      // filestore-specific constraints before delegating to
      // prepare_new_pool().
12933 } else if (prefix == "osd pool create") {
12934 int64_t pg_num = cmd_getval_or<int64_t>(cmdmap, "pg_num", 0);
12935 int64_t pg_num_min = cmd_getval_or<int64_t>(cmdmap, "pg_num_min", 0);
12936 int64_t pg_num_max = cmd_getval_or<int64_t>(cmdmap, "pg_num_max", 0);
12937 int64_t pgp_num = cmd_getval_or<int64_t>(cmdmap, "pgp_num", pg_num);
12938 string pool_type_str;
12939 cmd_getval(cmdmap, "pool_type", pool_type_str);
12940 if (pool_type_str.empty())
12941 pool_type_str = g_conf().get_val<string>("osd_pool_default_type");
12942
      // Idempotency: creating an existing pool of the same type succeeds;
      // asking for a different type is an error.
12943 string poolstr;
12944 cmd_getval(cmdmap, "pool", poolstr);
12945 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
12946 if (pool_id >= 0) {
12947 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
12948 if (pool_type_str != p->get_type_name()) {
12949 ss << "pool '" << poolstr << "' cannot change to type " << pool_type_str;
12950 err = -EINVAL;
12951 } else {
12952 ss << "pool '" << poolstr << "' already exists";
12953 err = 0;
12954 }
12955 goto reply;
12956 }
12957
12958 int pool_type;
12959 if (pool_type_str == "replicated") {
12960 pool_type = pg_pool_t::TYPE_REPLICATED;
12961 } else if (pool_type_str == "erasure") {
12962 pool_type = pg_pool_t::TYPE_ERASURE;
12963 } else {
12964 ss << "unknown pool type '" << pool_type_str << "'";
12965 err = -EINVAL;
12966 goto reply;
12967 }
12968
12969 bool implicit_rule_creation = false;
12970 int64_t expected_num_objects = 0;
12971 string rule_name;
12972 cmd_getval(cmdmap, "rule", rule_name);
12973 string erasure_code_profile;
12974 cmd_getval(cmdmap, "erasure_code_profile", erasure_code_profile);
12975
12976 if (pool_type == pg_pool_t::TYPE_ERASURE) {
12977 if (erasure_code_profile == "")
12978 erasure_code_profile = "default";
12979 //handle the erasure code profile
12980 if (erasure_code_profile == "default") {
      // If the default EC profile is not committed yet, stage it in the
      // pending increment and wait for the proposal before proceeding.
12981 if (!osdmap.has_erasure_code_profile(erasure_code_profile)) {
12982 if (pending_inc.has_erasure_code_profile(erasure_code_profile)) {
12983 dout(20) << "erasure code profile " << erasure_code_profile << " already pending" << dendl;
12984 goto wait;
12985 }
12986
12987 map<string,string> profile_map;
12988 err = osdmap.get_erasure_code_profile_default(cct,
12989 profile_map,
12990 &ss);
12991 if (err)
12992 goto reply;
12993 dout(20) << "erasure code profile " << erasure_code_profile << " set" << dendl;
12994 pending_inc.set_erasure_code_profile(erasure_code_profile, profile_map);
12995 goto wait;
12996 }
12997 }
12998 if (rule_name == "") {
12999 implicit_rule_creation = true;
13000 if (erasure_code_profile == "default") {
13001 rule_name = "erasure-code";
13002 } else {
13003 dout(1) << "implicitly use rule named after the pool: "
13004 << poolstr << dendl;
13005 rule_name = poolstr;
13006 }
13007 }
13008 expected_num_objects =
13009 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13010 } else {
13011 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
13012 // and put expected_num_objects to rule field
13013 if (erasure_code_profile != "") { // cmd is from CLI
13014 if (rule_name != "") {
      // positional shuffle (see NOTE above): "rule" actually holds the
      // expected_num_objects value on this CLI path
13015 string interr;
13016 expected_num_objects = strict_strtoll(rule_name.c_str(), 10, &interr);
13017 if (interr.length()) {
13018 ss << "error parsing integer value '" << rule_name << "': " << interr;
13019 err = -EINVAL;
13020 goto reply;
13021 }
13022 }
13023 rule_name = erasure_code_profile;
13024 } else { // cmd is well-formed
13025 expected_num_objects =
13026 cmd_getval_or<int64_t>(cmdmap, "expected_num_objects", 0);
13027 }
13028 }
13029
      // Resolve an explicitly-named crush rule now; implicit rules are
      // created later by prepare_new_pool().
13030 if (!implicit_rule_creation && rule_name != "") {
13031 int rule;
13032 err = get_crush_rule(rule_name, &rule, &ss);
13033 if (err == -EAGAIN) {
13034 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13035 return true;
13036 }
13037 if (err)
13038 goto reply;
13039 }
13040
13041 if (expected_num_objects < 0) {
13042 ss << "'expected_num_objects' must be non-negative";
13043 err = -EINVAL;
13044 goto reply;
13045 }
13046
      // The expected_num_objects/filestore_merge_threshold constraints only
      // matter if at least one OSD in the cluster runs filestore.
13047 set<int32_t> osds;
13048 osdmap.get_all_osds(osds);
13049 bool has_filestore_osd = std::any_of(osds.begin(), osds.end(), [this](int osd) {
13050 string type;
13051 if (!get_osd_objectstore_type(osd, &type)) {
13052 return type == "filestore";
13053 } else {
13054 return false;
13055 }
13056 });
13057
13058 if (has_filestore_osd &&
13059 expected_num_objects > 0 &&
13060 cct->_conf->filestore_merge_threshold > 0) {
13061 ss << "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
13062 err = -EINVAL;
13063 goto reply;
13064 }
13065
13066 if (has_filestore_osd &&
13067 expected_num_objects == 0 &&
13068 cct->_conf->filestore_merge_threshold < 0) {
      // Warn (and require --yes-i-really-mean-it) for large pools created
      // without expected_num_objects when pre-splitting is configured.
13069 int osds = osdmap.get_num_osds();
13070 bool sure = false;
13071 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13072 if (!sure && osds && (pg_num >= 1024 || pg_num / osds >= 100)) {
13073 ss << "For better initial performance on pools expected to store a "
13074 << "large number of objects, consider supplying the "
13075 << "expected_num_objects parameter when creating the pool."
13076 << " Pass --yes-i-really-mean-it to ignore it";
13077 err = -EPERM;
13078 goto reply;
13079 }
13080 }
13081
      // fast_read: -1 = pool-type default, 0 = off, >0 = on
13082 int64_t fast_read_param = cmd_getval_or<int64_t>(cmdmap, "fast_read", -1);
13083 FastReadType fast_read = FAST_READ_DEFAULT;
13084 if (fast_read_param == 0)
13085 fast_read = FAST_READ_OFF;
13086 else if (fast_read_param > 0)
13087 fast_read = FAST_READ_ON;
13088
13089 int64_t repl_size = 0;
13090 cmd_getval(cmdmap, "size", repl_size);
13091 int64_t target_size_bytes = 0;
13092 double target_size_ratio = 0.0;
13093 cmd_getval(cmdmap, "target_size_bytes", target_size_bytes);
13094 cmd_getval(cmdmap, "target_size_ratio", target_size_ratio);
13095
13096 string pg_autoscale_mode;
13097 cmd_getval(cmdmap, "autoscale_mode", pg_autoscale_mode);
13098
13099 bool bulk = cmd_getval_or<bool>(cmdmap, "bulk", 0);
      // All validation done; stage the new pool in the pending increment.
13100 err = prepare_new_pool(poolstr,
13101 -1, // default crush rule
13102 rule_name,
13103 pg_num, pgp_num, pg_num_min, pg_num_max,
13104 repl_size, target_size_bytes, target_size_ratio,
13105 erasure_code_profile, pool_type,
13106 (uint64_t)expected_num_objects,
13107 fast_read,
13108 pg_autoscale_mode,
13109 bulk,
13110 &ss);
13111 if (err < 0) {
13112 switch(err) {
13113 case -EEXIST:
13114 ss << "pool '" << poolstr << "' already exists";
13115 break;
13116 case -EAGAIN:
13117 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13118 return true;
13119 case -ERANGE:
13120 goto reply;
13121 default:
13122 goto reply;
13123 break;
13124 }
13125 } else {
13126 ss << "pool '" << poolstr << "' created";
13127 }
13128 getline(ss, rs);
13129 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13130 get_last_committed() + 1));
13131 return true;
13132
      // "osd pool delete|rm": destroy a pool. Safety interlock requires the
      // pool name to be given twice plus --yes-i-really-really-mean-it.
13133 } else if (prefix == "osd pool delete" ||
13134 prefix == "osd pool rm") {
13135 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
13136 string poolstr, poolstr2, sure;
13137 cmd_getval(cmdmap, "pool", poolstr);
13138 cmd_getval(cmdmap, "pool2", poolstr2);
13139 int64_t pool = osdmap.lookup_pg_pool_name(poolstr.c_str());
13140 if (pool < 0) {
      // idempotent: deleting a non-existent pool succeeds
13141 ss << "pool '" << poolstr << "' does not exist";
13142 err = 0;
13143 goto reply;
13144 }
13145
      // two flag variants: the normal one, and a test-only "not faking" one
13146 bool force_no_fake = false;
13147 cmd_getval(cmdmap, "yes_i_really_really_mean_it", force_no_fake);
13148 bool force = false;
13149 cmd_getval(cmdmap, "yes_i_really_really_mean_it_not_faking", force);
13150 if (poolstr2 != poolstr ||
13151 (!force && !force_no_fake)) {
13152 ss << "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
13153 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
13154 << "followed by --yes-i-really-really-mean-it.";
13155 err = -EPERM;
13156 goto reply;
13157 }
13158 err = _prepare_remove_pool(pool, &ss, force_no_fake);
13159 if (err == -EAGAIN) {
13160 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13161 return true;
13162 }
13163 if (err < 0)
13164 goto reply;
13165 goto update;
      // "osd pool rename <src> <dst>": rename a pool, with idempotency
      // handling when the rename appears to have already happened.
13166 } else if (prefix == "osd pool rename") {
13167 string srcpoolstr, destpoolstr;
13168 cmd_getval(cmdmap, "srcpool", srcpoolstr);
13169 cmd_getval(cmdmap, "destpool", destpoolstr);
13170 int64_t pool_src = osdmap.lookup_pg_pool_name(srcpoolstr.c_str());
13171 int64_t pool_dst = osdmap.lookup_pg_pool_name(destpoolstr.c_str());
13172
13173 if (pool_src < 0) {
13174 if (pool_dst >= 0) {
13175 // src pool doesn't exist, dst pool does exist: to ensure idempotency
13176 // of operations, assume this rename succeeded, as it is not changing
13177 // the current state. Make sure we output something understandable
13178 // for whoever is issuing the command, if they are paying attention,
13179 // in case it was not intentional; or to avoid a "wtf?" and a bug
13180 // report in case it was intentional, while expecting a failure.
13181 ss << "pool '" << srcpoolstr << "' does not exist; pool '"
13182 << destpoolstr << "' does -- assuming successful rename";
13183 err = 0;
13184 } else {
13185 ss << "unrecognized pool '" << srcpoolstr << "'";
13186 err = -ENOENT;
13187 }
13188 goto reply;
13189 } else if (pool_dst >= 0) {
13190 // source pool exists and so does the destination pool
13191 ss << "pool '" << destpoolstr << "' already exists";
13192 err = -EEXIST;
13193 goto reply;
13194 }
13195
      // Note: ret (not err) is passed to C_Command below, so a failed
      // rename still replies with the underlying error code.
13196 int ret = _prepare_rename_pool(pool_src, destpoolstr);
13197 if (ret == 0) {
13198 ss << "pool '" << srcpoolstr << "' renamed to '" << destpoolstr << "'";
13199 } else {
13200 ss << "failed to rename pool '" << srcpoolstr << "' to '" << destpoolstr << "': "
13201 << cpp_strerror(ret);
13202 }
13203 getline(ss, rs);
13204 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, ret, rs,
13205 get_last_committed() + 1));
13206 return true;
13207
      // "osd pool set": thin wrapper that delegates all option parsing and
      // validation to prepare_command_pool_set().
13208 } else if (prefix == "osd pool set") {
13209 err = prepare_command_pool_set(cmdmap, ss);
13210 if (err == -EAGAIN)
13211 goto wait;
13212 if (err < 0)
13213 goto reply;
13214
13215 getline(ss, rs);
13216 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13217 get_last_committed() + 1));
13218 return true;
      // "osd tier add <pool> <tierpool>": attach tierpool as a cache tier of
      // pool. The tier must normally be empty, must not be erasure-coded,
      // and must not carry snapshot state.
13219 } else if (prefix == "osd tier add") {
13220 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13221 if (err == -EAGAIN)
13222 goto wait;
13223 if (err)
13224 goto reply;
13225 string poolstr;
13226 cmd_getval(cmdmap, "pool", poolstr);
13227 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13228 if (pool_id < 0) {
13229 ss << "unrecognized pool '" << poolstr << "'";
13230 err = -ENOENT;
13231 goto reply;
13232 }
13233 string tierpoolstr;
13234 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13235 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13236 if (tierpool_id < 0) {
13237 ss << "unrecognized pool '" << tierpoolstr << "'";
13238 err = -ENOENT;
13239 goto reply;
13240 }
13241 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13242 ceph_assert(p);
13243 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13244 ceph_assert(tp);
13245
13246 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13247 goto reply;
13248 }
13249
13250 // make sure new tier is empty
13251 bool force_nonempty = false;
13252 cmd_getval_compat_cephbool(cmdmap, "force_nonempty", force_nonempty);
13253 const pool_stat_t *pstats = mon.mgrstatmon()->get_pool_stat(tierpool_id);
13254 if (pstats && pstats->stats.sum.num_objects != 0 &&
13255 !force_nonempty) {
13256 ss << "tier pool '" << tierpoolstr << "' is not empty; --force-nonempty to force";
13257 err = -ENOTEMPTY;
13258 goto reply;
13259 }
13260 if (tp->is_erasure()) {
13261 ss << "tier pool '" << tierpoolstr
13262 << "' is an ec pool, which cannot be a tier";
13263 err = -ENOTSUP;
13264 goto reply;
13265 }
      // snapshot state on the tier would be broken by tiering; only a debug
      // conf option plus --force-nonempty can override
13266 if ((!tp->removed_snaps.empty() || !tp->snaps.empty()) &&
13267 (!force_nonempty ||
13268 !g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps)) {
13269 ss << "tier pool '" << tierpoolstr << "' has snapshot state; it cannot be added as a tier without breaking the pool";
13270 err = -ENOTEMPTY;
13271 goto reply;
13272 }
13273 // go
13274 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13275 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
      // if the relationship is already pending in this increment, retry
      // after the proposal rather than double-applying
13276 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13277 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13278 return true;
13279 }
13280 np->tiers.insert(tierpool_id);
13281 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13282 ntp->tier_of = pool_id;
13283 ss << "pool '" << tierpoolstr << "' is now (or already was) a tier of '" << poolstr << "'";
13284 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13285 get_last_committed() + 1));
13286 return true;
      // "osd tier remove|rm <pool> <tierpool>": detach a cache tier from its
      // base pool. Refused while the tier is still the overlay.
13287 } else if (prefix == "osd tier remove" ||
13288 prefix == "osd tier rm") {
13289 string poolstr;
13290 cmd_getval(cmdmap, "pool", poolstr);
13291 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13292 if (pool_id < 0) {
13293 ss << "unrecognized pool '" << poolstr << "'";
13294 err = -ENOENT;
13295 goto reply;
13296 }
13297 string tierpoolstr;
13298 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13299 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13300 if (tierpool_id < 0) {
13301 ss << "unrecognized pool '" << tierpoolstr << "'";
13302 err = -ENOENT;
13303 goto reply;
13304 }
13305 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13306 ceph_assert(p);
13307 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13308 ceph_assert(tp);
13309
13310 if (!_check_remove_tier(pool_id, p, tp, &err, &ss)) {
13311 goto reply;
13312 }
13313
      // idempotent: not currently a tier of this pool
13314 if (p->tiers.count(tierpool_id) == 0) {
13315 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13316 err = 0;
13317 goto reply;
13318 }
      // cross-check map consistency: the tier must point back at this pool
13319 if (tp->tier_of != pool_id) {
13320 ss << "tier pool '" << tierpoolstr << "' is a tier of '"
13321 << osdmap.get_pool_name(tp->tier_of) << "': "
13322 // be scary about it; this is an inconsistency and bells must go off
13323 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13324 err = -EINVAL;
13325 goto reply;
13326 }
13327 if (p->read_tier == tierpool_id) {
13328 ss << "tier pool '" << tierpoolstr << "' is the overlay for '" << poolstr << "'; please remove-overlay first";
13329 err = -EBUSY;
13330 goto reply;
13331 }
13332 // go
13333 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13334 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
      // if the pending copies already reflect removal (or a conflicting
      // change), retry after the proposal commits
13335 if (np->tiers.count(tierpool_id) == 0 ||
13336 ntp->tier_of != pool_id ||
13337 np->read_tier == tierpool_id) {
13338 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13339 return true;
13340 }
13341 np->tiers.erase(tierpool_id);
13342 ntp->clear_tier();
13343 ss << "pool '" << tierpoolstr << "' is now (or already was) not a tier of '" << poolstr << "'";
13344 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13345 get_last_committed() + 1));
13346 return true;
      // "osd tier set-overlay <pool> <overlaypool>": direct client reads and
      // writes for pool through an existing tier pool.
13347 } else if (prefix == "osd tier set-overlay") {
13348 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13349 if (err == -EAGAIN)
13350 goto wait;
13351 if (err)
13352 goto reply;
13353 string poolstr;
13354 cmd_getval(cmdmap, "pool", poolstr);
13355 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13356 if (pool_id < 0) {
13357 ss << "unrecognized pool '" << poolstr << "'";
13358 err = -ENOENT;
13359 goto reply;
13360 }
13361 string overlaypoolstr;
13362 cmd_getval(cmdmap, "overlaypool", overlaypoolstr);
13363 int64_t overlaypool_id = osdmap.lookup_pg_pool_name(overlaypoolstr);
13364 if (overlaypool_id < 0) {
13365 ss << "unrecognized pool '" << overlaypoolstr << "'";
13366 err = -ENOENT;
13367 goto reply;
13368 }
13369 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13370 ceph_assert(p);
13371 const pg_pool_t *overlay_p = osdmap.get_pg_pool(overlaypool_id);
13372 ceph_assert(overlay_p);
      // the overlay must already be a tier of this pool
13373 if (p->tiers.count(overlaypool_id) == 0) {
13374 ss << "tier pool '" << overlaypoolstr << "' is not a tier of '" << poolstr << "'";
13375 err = -EINVAL;
13376 goto reply;
13377 }
      // idempotent: same overlay already set
13378 if (p->read_tier == overlaypool_id) {
13379 err = 0;
13380 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13381 goto reply;
13382 }
13383 if (p->has_read_tier()) {
13384 ss << "pool '" << poolstr << "' has overlay '"
13385 << osdmap.get_pool_name(p->read_tier)
13386 << "'; please remove-overlay first";
13387 err = -EINVAL;
13388 goto reply;
13389 }
13390
13391 // go
      // force_op_resend on both pools so in-flight client ops are redirected
13392 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13393 np->read_tier = overlaypool_id;
13394 np->write_tier = overlaypool_id;
13395 np->set_last_force_op_resend(pending_inc.epoch);
13396 pg_pool_t *noverlay_p = pending_inc.get_new_pool(overlaypool_id, overlay_p);
13397 noverlay_p->set_last_force_op_resend(pending_inc.epoch);
13398 ss << "overlay for '" << poolstr << "' is now (or already was) '" << overlaypoolstr << "'";
13399 if (overlay_p->cache_mode == pg_pool_t::CACHEMODE_NONE)
13400 ss <<" (WARNING: overlay pool cache_mode is still NONE)";
13401 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13402 get_last_committed() + 1));
13403 return true;
      // "osd tier remove-overlay|rm-overlay <pool>": stop routing pool's
      // reads/writes through its overlay tier.
13404 } else if (prefix == "osd tier remove-overlay" ||
13405 prefix == "osd tier rm-overlay") {
13406 string poolstr;
13407 cmd_getval(cmdmap, "pool", poolstr);
13408 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13409 if (pool_id < 0) {
13410 ss << "unrecognized pool '" << poolstr << "'";
13411 err = -ENOENT;
13412 goto reply;
13413 }
13414 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13415 ceph_assert(p);
      // idempotent: no overlay configured
13416 if (!p->has_read_tier()) {
13417 err = 0;
13418 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13419 goto reply;
13420 }
13421
13422 if (!_check_remove_tier(pool_id, p, NULL, &err, &ss)) {
13423 goto reply;
13424 }
13425
13426 // go
      // force op resend on the (former) read/write tier pools as well, so
      // in-flight client ops are redirected back to the base pool
13427 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13428 if (np->has_read_tier()) {
13429 const pg_pool_t *op = osdmap.get_pg_pool(np->read_tier);
13430 pg_pool_t *nop = pending_inc.get_new_pool(np->read_tier,op);
13431 nop->set_last_force_op_resend(pending_inc.epoch);
13432 }
13433 if (np->has_write_tier()) {
13434 const pg_pool_t *op = osdmap.get_pg_pool(np->write_tier);
13435 pg_pool_t *nop = pending_inc.get_new_pool(np->write_tier, op);
13436 nop->set_last_force_op_resend(pending_inc.epoch);
13437 }
13438 np->clear_read_tier();
13439 np->clear_write_tier();
13440 np->set_last_force_op_resend(pending_inc.epoch);
13441 ss << "there is now (or already was) no overlay for '" << poolstr << "'";
13442 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13443 get_last_committed() + 1));
13444 return true;
      // "osd tier cache-mode <pool> <mode>": change the caching mode of a
      // tier pool, enforcing the legal transition graph documented below.
13445 } else if (prefix == "osd tier cache-mode") {
13446 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13447 if (err == -EAGAIN)
13448 goto wait;
13449 if (err)
13450 goto reply;
13451 string poolstr;
13452 cmd_getval(cmdmap, "pool", poolstr);
13453 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13454 if (pool_id < 0) {
13455 ss << "unrecognized pool '" << poolstr << "'";
13456 err = -ENOENT;
13457 goto reply;
13458 }
13459 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13460 ceph_assert(p);
13461 if (!p->is_tier()) {
13462 ss << "pool '" << poolstr << "' is not a tier";
13463 err = -EINVAL;
13464 goto reply;
13465 }
13466 string modestr;
13467 cmd_getval(cmdmap, "mode", modestr);
13468 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13469 if (int(mode) < 0) {
13470 ss << "'" << modestr << "' is not a valid cache mode";
13471 err = -EINVAL;
13472 goto reply;
13473 }
13474
13475 bool sure = false;
13476 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13477
      // forward/readforward have been removed outright; other non-standard
      // modes require --yes-i-really-mean-it
13478 if (mode == pg_pool_t::CACHEMODE_FORWARD ||
13479 mode == pg_pool_t::CACHEMODE_READFORWARD) {
13480 ss << "'" << modestr << "' is no longer a supported cache mode";
13481 err = -EPERM;
13482 goto reply;
13483 }
13484 if ((mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13485 mode != pg_pool_t::CACHEMODE_NONE &&
13486 mode != pg_pool_t::CACHEMODE_PROXY &&
13487 mode != pg_pool_t::CACHEMODE_READPROXY) &&
13488 !sure) {
13489 ss << "'" << modestr << "' is not a well-supported cache mode and may "
13490 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13491 err = -EPERM;
13492 goto reply;
13493 }
13494
13495 // pool already has this cache-mode set and there are no pending changes
13496 if (p->cache_mode == mode &&
13497 (pending_inc.new_pools.count(pool_id) == 0 ||
13498 pending_inc.new_pools[pool_id].cache_mode == p->cache_mode)) {
13499 ss << "set cache-mode for pool '" << poolstr << "'"
13500 << " to " << pg_pool_t::get_cache_mode_name(mode);
13501 err = 0;
13502 goto reply;
13503 }
13504
13505 /* Mode description:
13506 *
13507 * none: No cache-mode defined
13508 * forward: Forward all reads and writes to base pool [removed]
13509 * writeback: Cache writes, promote reads from base pool
13510 * readonly: Forward writes to base pool
13511 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13512 * proxy: Proxy all reads and writes to base pool
13513 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13514 *
13515 * Hence, these are the allowed transitions:
13516 *
13517 * none -> any
13518 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13519 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13520 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13521 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13522 * writeback -> readproxy || proxy
13523 * readonly -> any
13524 */
13525
13526 // We check if the transition is valid against the current pool mode, as
13527 // it is the only committed state thus far. We will blantly squash
13528 // whatever mode is on the pending state.
13529
13530 if (p->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
13531 (mode != pg_pool_t::CACHEMODE_PROXY &&
13532 mode != pg_pool_t::CACHEMODE_READPROXY)) {
13533 ss << "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode)
13534 << "' on a '" << pg_pool_t::get_cache_mode_name(p->cache_mode)
13535 << "' pool; only '"
13536 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY)
13537 << "' allowed.";
13538 err = -EINVAL;
13539 goto reply;
13540 }
      // transitions listed above as "any IF num_objects_dirty == 0":
      // permitted only when the tier holds no dirty objects
13541 if ((p->cache_mode == pg_pool_t::CACHEMODE_READFORWARD &&
13542 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13543 mode != pg_pool_t::CACHEMODE_PROXY &&
13544 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13545
13546 (p->cache_mode == pg_pool_t::CACHEMODE_READPROXY &&
13547 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13548 mode != pg_pool_t::CACHEMODE_PROXY)) ||
13549
13550 (p->cache_mode == pg_pool_t::CACHEMODE_PROXY &&
13551 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13552 mode != pg_pool_t::CACHEMODE_READPROXY)) ||
13553
13554 (p->cache_mode == pg_pool_t::CACHEMODE_FORWARD &&
13555 (mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13556 mode != pg_pool_t::CACHEMODE_PROXY &&
13557 mode != pg_pool_t::CACHEMODE_READPROXY))) {
13558
13559 const pool_stat_t* pstats =
13560 mon.mgrstatmon()->get_pool_stat(pool_id);
13561
13562 if (pstats && pstats->stats.sum.num_objects_dirty > 0) {
13563 ss << "unable to set cache-mode '"
13564 << pg_pool_t::get_cache_mode_name(mode) << "' on pool '" << poolstr
13565 << "': dirty objects found";
13566 err = -EBUSY;
13567 goto reply;
13568 }
13569 }
13570 // go
13571 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13572 np->cache_mode = mode;
13573 // set this both when moving to and from cache_mode NONE. this is to
13574 // capture legacy pools that were set up before this flag existed.
13575 np->flags |= pg_pool_t::FLAG_INCOMPLETE_CLONES;
13576 ss << "set cache-mode for pool '" << poolstr
13577 << "' to " << pg_pool_t::get_cache_mode_name(mode);
13578 if (mode == pg_pool_t::CACHEMODE_NONE) {
13579 const pg_pool_t *base_pool = osdmap.get_pg_pool(np->tier_of);
13580 ceph_assert(base_pool);
13581 if (base_pool->read_tier == pool_id ||
13582 base_pool->write_tier == pool_id)
13583 ss <<" (WARNING: pool is still configured as read or write tier)";
13584 }
13585 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13586 get_last_committed() + 1));
13587 return true;
      // "osd tier add-cache <pool> <tierpool> <size>": attach tierpool as a
      // cache tier AND configure it (cache mode, hit-set params, overlay)
      // from the osd_tier_default_* conf options in one step.
      // NOTE(review): this branch continues past the end of this chunk.
13588 } else if (prefix == "osd tier add-cache") {
13589 err = check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL, ss);
13590 if (err == -EAGAIN)
13591 goto wait;
13592 if (err)
13593 goto reply;
13594 string poolstr;
13595 cmd_getval(cmdmap, "pool", poolstr);
13596 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13597 if (pool_id < 0) {
13598 ss << "unrecognized pool '" << poolstr << "'";
13599 err = -ENOENT;
13600 goto reply;
13601 }
13602 string tierpoolstr;
13603 cmd_getval(cmdmap, "tierpool", tierpoolstr);
13604 int64_t tierpool_id = osdmap.lookup_pg_pool_name(tierpoolstr);
13605 if (tierpool_id < 0) {
13606 ss << "unrecognized pool '" << tierpoolstr << "'";
13607 err = -ENOENT;
13608 goto reply;
13609 }
13610 const pg_pool_t *p = osdmap.get_pg_pool(pool_id);
13611 ceph_assert(p);
13612 const pg_pool_t *tp = osdmap.get_pg_pool(tierpool_id);
13613 ceph_assert(tp);
13614
13615 if (!_check_become_tier(tierpool_id, tp, pool_id, p, &err, &ss)) {
13616 goto reply;
13617 }
13618
13619 int64_t size = 0;
13620 if (!cmd_getval(cmdmap, "size", size)) {
13621 ss << "unable to parse 'size' value '"
13622 << cmd_vartype_stringify(cmdmap.at("size")) << "'";
13623 err = -EINVAL;
13624 goto reply;
13625 }
13626 // make sure new tier is empty
13627 const pool_stat_t *pstats =
13628 mon.mgrstatmon()->get_pool_stat(tierpool_id);
13629 if (pstats && pstats->stats.sum.num_objects != 0) {
13630 ss << "tier pool '" << tierpoolstr << "' is not empty";
13631 err = -ENOTEMPTY;
13632 goto reply;
13633 }
      // cache mode and hit-set defaults come from configuration, not the
      // command line
13634 auto& modestr = g_conf().get_val<string>("osd_tier_default_cache_mode");
13635 pg_pool_t::cache_mode_t mode = pg_pool_t::get_cache_mode_from_str(modestr);
13636 if (int(mode) < 0) {
13637 ss << "osd tier cache default mode '" << modestr << "' is not a valid cache mode";
13638 err = -EINVAL;
13639 goto reply;
13640 }
13641 HitSet::Params hsp;
13642 auto& cache_hit_set_type =
13643 g_conf().get_val<string>("osd_tier_default_cache_hit_set_type");
13644 if (cache_hit_set_type == "bloom") {
13645 BloomHitSet::Params *bsp = new BloomHitSet::Params;
13646 bsp->set_fpp(g_conf().get_val<double>("osd_pool_default_hit_set_bloom_fpp"));
13647 hsp = HitSet::Params(bsp);
13648 } else if (cache_hit_set_type == "explicit_hash") {
13649 hsp = HitSet::Params(new ExplicitHashHitSet::Params);
13650 } else if (cache_hit_set_type == "explicit_object") {
13651 hsp = HitSet::Params(new ExplicitObjectHitSet::Params);
13652 } else {
13653 ss << "osd tier cache default hit set type '"
13654 << cache_hit_set_type << "' is not a known type";
13655 err = -EINVAL;
13656 goto reply;
13657 }
13658 // go
13659 pg_pool_t *np = pending_inc.get_new_pool(pool_id, p);
13660 pg_pool_t *ntp = pending_inc.get_new_pool(tierpool_id, tp);
      // already pending: retry after the proposal rather than double-apply
13661 if (np->tiers.count(tierpool_id) || ntp->is_tier()) {
13662 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13663 return true;
13664 }
      // unlike plain "tier add", this also makes the tier the overlay
13665 np->tiers.insert(tierpool_id);
13666 np->read_tier = np->write_tier = tierpool_id;
13667 np->set_snap_epoch(pending_inc.epoch); // tier will update to our snap info
13668 np->set_last_force_op_resend(pending_inc.epoch);
13669 ntp->set_last_force_op_resend(pending_inc.epoch);
13670 ntp->tier_of = pool_id;
13671 ntp->cache_mode = mode;
13672 ntp->hit_set_count = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_count");
13673 ntp->hit_set_period = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_period");
13674 ntp->min_read_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13675 ntp->min_write_recency_for_promote = g_conf().get_val<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13676 ntp->hit_set_grade_decay_rate = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13677 ntp->hit_set_search_last_n = g_conf().get_val<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13678 ntp->hit_set_params = hsp;
13679 ntp->target_max_bytes = size;
13680 ss << "pool '" << tierpoolstr << "' is now (or already was) a cache tier of '" << poolstr << "'";
13681 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, ss.str(),
13682 get_last_committed() + 1));
13683 return true;
13684 } else if (prefix == "osd pool set-quota") {
13685 string poolstr;
13686 cmd_getval(cmdmap, "pool", poolstr);
13687 int64_t pool_id = osdmap.lookup_pg_pool_name(poolstr);
13688 if (pool_id < 0) {
13689 ss << "unrecognized pool '" << poolstr << "'";
13690 err = -ENOENT;
13691 goto reply;
13692 }
13693
13694 string field;
13695 cmd_getval(cmdmap, "field", field);
13696 if (field != "max_objects" && field != "max_bytes") {
13697 ss << "unrecognized field '" << field << "'; should be 'max_bytes' or 'max_objects'";
13698 err = -EINVAL;
13699 goto reply;
13700 }
13701
13702 // val could contain unit designations, so we treat as a string
13703 string val;
13704 cmd_getval(cmdmap, "val", val);
13705 string tss;
13706 int64_t value;
13707 if (field == "max_objects") {
13708 value = strict_si_cast<uint64_t>(val, &tss);
13709 } else if (field == "max_bytes") {
13710 value = strict_iecstrtoll(val, &tss);
13711 } else {
13712 ceph_abort_msg("unrecognized option");
13713 }
13714 if (!tss.empty()) {
13715 ss << "error parsing value '" << val << "': " << tss;
13716 err = -EINVAL;
13717 goto reply;
13718 }
13719
13720 pg_pool_t *pi = pending_inc.get_new_pool(pool_id, osdmap.get_pg_pool(pool_id));
13721 if (field == "max_objects") {
13722 pi->quota_max_objects = value;
13723 } else if (field == "max_bytes") {
13724 pi->quota_max_bytes = value;
13725 } else {
13726 ceph_abort_msg("unrecognized option");
13727 }
13728 ss << "set-quota " << field << " = " << value << " for pool " << poolstr;
13729 rs = ss.str();
13730 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13731 get_last_committed() + 1));
13732 return true;
13733 } else if (prefix == "osd pool application enable" ||
13734 prefix == "osd pool application disable" ||
13735 prefix == "osd pool application set" ||
13736 prefix == "osd pool application rm") {
13737 err = prepare_command_pool_application(prefix, cmdmap, ss);
13738 if (err == -EAGAIN) {
13739 goto wait;
13740 } else if (err < 0) {
13741 goto reply;
13742 } else {
13743 goto update;
13744 }
13745 } else if (prefix == "osd force-create-pg") {
13746 pg_t pgid;
13747 string pgidstr;
13748 cmd_getval(cmdmap, "pgid", pgidstr);
13749 if (!pgid.parse(pgidstr.c_str())) {
13750 ss << "invalid pgid '" << pgidstr << "'";
13751 err = -EINVAL;
13752 goto reply;
13753 }
13754 if (!osdmap.pg_exists(pgid)) {
13755 ss << "pg " << pgid << " should not exist";
13756 err = -ENOENT;
13757 goto reply;
13758 }
13759 bool sure = false;
13760 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13761 if (!sure) {
13762 ss << "This command will recreate a lost (as in data lost) PG with data in it, such "
13763 << "that the cluster will give up ever trying to recover the lost data. Do this "
13764 << "only if you are certain that all copies of the PG are in fact lost and you are "
13765 << "willing to accept that the data is permanently destroyed. Pass "
13766 << "--yes-i-really-mean-it to proceed.";
13767 err = -EPERM;
13768 goto reply;
13769 }
13770 bool creating_now;
13771 {
13772 std::lock_guard<std::mutex> l(creating_pgs_lock);
13773 auto emplaced = creating_pgs.pgs.emplace(
13774 pgid,
13775 creating_pgs_t::pg_create_info(osdmap.get_epoch(),
13776 ceph_clock_now()));
13777 creating_now = emplaced.second;
13778 }
13779 if (creating_now) {
13780 ss << "pg " << pgidstr << " now creating, ok";
13781 // set the pool's CREATING flag so that (1) the osd won't ignore our
13782 // create message and (2) we won't propose any future pg_num changes
13783 // until after the PG has been instantiated.
13784 if (pending_inc.new_pools.count(pgid.pool()) == 0) {
13785 pending_inc.new_pools[pgid.pool()] = *osdmap.get_pg_pool(pgid.pool());
13786 }
13787 pending_inc.new_pools[pgid.pool()].flags |= pg_pool_t::FLAG_CREATING;
13788 err = 0;
13789 goto update;
13790 } else {
13791 ss << "pg " << pgid << " already creating";
13792 err = 0;
13793 goto reply;
13794 }
13795 } else if (prefix == "osd force_healthy_stretch_mode") {
13796 bool sure = false;
13797 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13798 if (!sure) {
13799 ss << "This command will require peering across multiple CRUSH buckets "
13800 "(probably two data centers or availability zones?) and may result in PGs "
13801 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13802 err = -EPERM;
13803 goto reply;
13804 }
13805 try_end_recovery_stretch_mode(true);
13806 ss << "Triggering healthy stretch mode";
13807 err = 0;
13808 goto reply;
13809 } else if (prefix == "osd force_recovery_stretch_mode") {
13810 bool sure = false;
13811 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
13812 if (!sure) {
13813 ss << "This command will increase pool sizes to try and spread them "
13814 "across multiple CRUSH buckets (probably two data centers or "
13815 "availability zones?) and should have happened automatically"
13816 "Pass --yes-i-really-mean-it to proceed.";
13817 err = -EPERM;
13818 goto reply;
13819 }
13820 mon.go_recovery_stretch_mode();
13821 ss << "Triggering recovery stretch mode";
13822 err = 0;
13823 goto reply;
13824 } else {
13825 err = -EINVAL;
13826 }
13827
13828 reply:
13829 getline(ss, rs);
13830 if (err < 0 && rs.length() == 0)
13831 rs = cpp_strerror(err);
13832 mon.reply_command(op, err, rs, rdata, get_last_committed());
13833 return ret;
13834
13835 update:
13836 getline(ss, rs);
13837 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, 0, rs,
13838 get_last_committed() + 1));
13839 return true;
13840
13841 wait:
13842 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
13843 return true;
13844 }
13845
13846 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op)
13847 {
13848 op->mark_osdmon_event(__func__);
13849
13850 auto m = op->get_req<MPoolOp>();
13851 MonSession *session = op->get_session();
13852 if (!session) {
13853 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13854 return true;
13855 }
13856
13857 switch (m->op) {
13858 case POOL_OP_CREATE_UNMANAGED_SNAP:
13859 case POOL_OP_DELETE_UNMANAGED_SNAP:
13860 {
13861 const std::string* pool_name = nullptr;
13862 const pg_pool_t *pg_pool = osdmap.get_pg_pool(m->pool);
13863 if (pg_pool != nullptr) {
13864 pool_name = &osdmap.get_pool_name(m->pool);
13865 }
13866
13867 if (!is_unmanaged_snap_op_permitted(cct, mon.key_server,
13868 session->entity_name, session->caps,
13869 session->get_peer_socket_addr(),
13870 pool_name)) {
13871 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13872 << "privileges. message: " << *m << std::endl
13873 << "caps: " << session->caps << dendl;
13874 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13875 return true;
13876 }
13877 }
13878 break;
13879 default:
13880 if (!session->is_capable("osd", MON_CAP_W)) {
13881 dout(0) << "got pool op from entity with insufficient privileges. "
13882 << "message: " << *m << std::endl
13883 << "caps: " << session->caps << dendl;
13884 _pool_op_reply(op, -EPERM, osdmap.get_epoch());
13885 return true;
13886 }
13887 break;
13888 }
13889
13890 return false;
13891 }
13892
// Fast path for pool ops: answer whatever can be decided from the current
// committed osdmap without proposing a new epoch.  Returns true when the
// request was fully handled here (replied or dropped); false when it must
// continue to prepare_pool_op() to mutate the map.
bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();

  // cap check may reply -EPERM and consume the request
  if (enforce_pool_op_caps(op)) {
    return true;
  }

  // drop messages addressed to a different cluster
  if (m->fsid != mon.monmap->fsid) {
    dout(0) << __func__ << " drop message on fsid " << m->fsid
            << " != " << mon.monmap->fsid << " for " << *m << dendl;
    _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
    return true;
  }

  if (m->op == POOL_OP_CREATE)
    return preprocess_pool_op_create(op);

  const pg_pool_t *p = osdmap.get_pg_pool(m->pool);
  if (p == nullptr) {
    dout(10) << "attempt to operate on non-existent pool id " << m->pool << dendl;
    // DELETE of a pool that is already gone is a success (idempotent);
    // anything else on a missing pool is an error.
    if (m->op == POOL_OP_DELETE) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
    } else {
      _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    }
    return true;
  }

  // check if the snap and snapname exist
  bool snap_exists = false;
  if (p->snap_exists(m->name.c_str()))
    snap_exists = true;

  // For each op: reject combinations that can never succeed (-EINVAL),
  // short-circuit already-satisfied requests (reply 0), otherwise return
  // false so prepare_pool_op() makes the change.  Note that pool-snaps
  // mode and unmanaged-snaps mode are mutually exclusive per pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (p->is_unmanaged_snaps_mode() || p->is_tier()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (snap_exists) {
      // already there: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_SNAP:
    if (p->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (!snap_exists) {
      // already gone: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (p->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return true;
    }
    if (_is_removed_snap(m->pool, m->snapid)) {
      // snap already removed/purged: idempotent success
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_DELETE:
    // NOTE(review): this replies success while a pool with this name still
    // exists (lookup >= 0) — presumably short-circuiting the legacy
    // name-based delete path; confirm against prepare_pool_op_delete()
    if (osdmap.lookup_pg_pool_name(m->name.c_str()) >= 0) {
      _pool_op_reply(op, 0, osdmap.get_epoch());
      return true;
    }
    return false;
  case POOL_OP_AUID_CHANGE:
    // auid changes are rejected later, in prepare_pool_op()
    return false;
  default:
    ceph_abort();
    break;
  }

  return false;
}
13980
13981 bool OSDMonitor::_is_removed_snap(int64_t pool, snapid_t snap)
13982 {
13983 if (!osdmap.have_pg_pool(pool)) {
13984 dout(10) << __func__ << " pool " << pool << " snap " << snap
13985 << " - pool dne" << dendl;
13986 return true;
13987 }
13988 if (osdmap.in_removed_snaps_queue(pool, snap)) {
13989 dout(10) << __func__ << " pool " << pool << " snap " << snap
13990 << " - in osdmap removed_snaps_queue" << dendl;
13991 return true;
13992 }
13993 snapid_t begin, end;
13994 int r = lookup_purged_snap(pool, snap, &begin, &end);
13995 if (r == 0) {
13996 dout(10) << __func__ << " pool " << pool << " snap " << snap
13997 << " - purged, [" << begin << "," << end << ")" << dendl;
13998 return true;
13999 }
14000 return false;
14001 }
14002
14003 bool OSDMonitor::_is_pending_removed_snap(int64_t pool, snapid_t snap)
14004 {
14005 if (pending_inc.old_pools.count(pool)) {
14006 dout(10) << __func__ << " pool " << pool << " snap " << snap
14007 << " - pool pending deletion" << dendl;
14008 return true;
14009 }
14010 if (pending_inc.in_new_removed_snaps(pool, snap)) {
14011 dout(10) << __func__ << " pool " << pool << " snap " << snap
14012 << " - in pending new_removed_snaps" << dendl;
14013 return true;
14014 }
14015 return false;
14016 }
14017
14018 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op)
14019 {
14020 op->mark_osdmon_event(__func__);
14021 auto m = op->get_req<MPoolOp>();
14022 int64_t pool = osdmap.lookup_pg_pool_name(m->name.c_str());
14023 if (pool >= 0) {
14024 _pool_op_reply(op, 0, osdmap.get_epoch());
14025 return true;
14026 }
14027
14028 return false;
14029 }
14030
// Apply a pool op to the pending incremental map.  Pool create/delete are
// delegated; the remaining (snapshot) ops validate against the committed
// pool, then mutate a *projected* copy of the pool (committed state plus
// any already-pending changes) so repeated ops within one proposal window
// compose correctly.  Returns true when a proposal was queued; false when
// a reply was sent without changing anything.
bool OSDMonitor::prepare_pool_op(MonOpRequestRef op)
{
  op->mark_osdmon_event(__func__);
  auto m = op->get_req<MPoolOp>();
  dout(10) << "prepare_pool_op " << *m << dendl;
  if (m->op == POOL_OP_CREATE) {
    return prepare_pool_op_create(op);
  } else if (m->op == POOL_OP_DELETE) {
    return prepare_pool_op_delete(op);
  }

  int ret = 0;
  bool changed = false;

  if (!osdmap.have_pg_pool(m->pool)) {
    _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
    return false;
  }

  const pg_pool_t *pool = osdmap.get_pg_pool(m->pool);

  // First pass: validate against the *committed* pool state.  Several of
  // these cases deliberately fall through to share checks.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (pool->is_tier()) {
      ret = -EINVAL;
      _pool_op_reply(op, ret, osdmap.get_epoch());
      return false;
    } // else, fall through
  case POOL_OP_DELETE_SNAP:
    if (!pool->is_unmanaged_snaps_mode()) {
      bool snap_exists = pool->snap_exists(m->name.c_str());
      // already satisfied (create of existing / delete of missing): reply 0
      if ((m->op == POOL_OP_CREATE_SNAP && snap_exists)
        || (m->op == POOL_OP_DELETE_SNAP && !snap_exists)) {
        ret = 0;
      } else {
        break;
      }
    } else {
      ret = -EINVAL;
    }
    _pool_op_reply(op, ret, osdmap.get_epoch());
    return false;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    // we won't allow removal of an unmanaged snapshot from a pool
    // not in unmanaged snaps mode.
    if (!pool->is_unmanaged_snaps_mode()) {
      _pool_op_reply(op, -ENOTSUP, osdmap.get_epoch());
      return false;
    }
    /* fall-thru */
  case POOL_OP_CREATE_UNMANAGED_SNAP:
    // but we will allow creating an unmanaged snapshot on any pool
    // as long as it is not in 'pool' snaps mode.
    if (pool->is_pool_snaps_mode()) {
      _pool_op_reply(op, -EINVAL, osdmap.get_epoch());
      return false;
    }
  }

  // projected pool info
  pg_pool_t pp;
  if (pending_inc.new_pools.count(m->pool))
    pp = pending_inc.new_pools[m->pool];
  else
    pp = *osdmap.get_pg_pool(m->pool);

  bufferlist reply_data;

  // pool snaps vs unmanaged snaps are mutually exclusive
  // (re-checked here against the projected state, which may differ
  // from the committed state tested above)
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
  case POOL_OP_DELETE_SNAP:
    if (pp.is_unmanaged_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (pp.is_pool_snaps_mode()) {
      ret = -EINVAL;
      goto out;
    }
  }

  // Second pass: perform the mutation on the projected pool.
  switch (m->op) {
  case POOL_OP_CREATE_SNAP:
    if (!pp.snap_exists(m->name.c_str())) {
      pp.add_snap(m->name.c_str(), ceph_clock_now());
      dout(10) << "create snap in pool " << m->pool << " " << m->name
	       << " seq " << pp.get_snap_epoch() << dendl;
      changed = true;
    }
    break;

  case POOL_OP_DELETE_SNAP:
    {
      snapid_t s = pp.snap_exists(m->name.c_str());
      if (s) {
	pp.remove_snap(s);
	pending_inc.new_removed_snaps[m->pool].insert(s);
	changed = true;
      }
    }
    break;

  case POOL_OP_CREATE_UNMANAGED_SNAP:
    {
      // the newly allocated snapid is returned to the client in reply_data
      uint64_t snapid = pp.add_unmanaged_snap(
	osdmap.require_osd_release < ceph_release_t::octopus);
      encode(snapid, reply_data);
      changed = true;
    }
    break;

  case POOL_OP_DELETE_UNMANAGED_SNAP:
    if (!_is_removed_snap(m->pool, m->snapid) &&
	!_is_pending_removed_snap(m->pool, m->snapid)) {
      // a snapid beyond the pool's current seq was never allocated
      if (m->snapid > pp.get_snap_seq()) {
        _pool_op_reply(op, -ENOENT, osdmap.get_epoch());
        return false;
      }
      pp.remove_unmanaged_snap(
	m->snapid,
	osdmap.require_osd_release < ceph_release_t::octopus);
      pending_inc.new_removed_snaps[m->pool].insert(m->snapid);
      // also record the new seq as purged: this avoids a discontinuity
      // after all of the snaps have been purged, since the seq assigned
      // during removal lives in the same namespace as the actual snaps.
      pending_pseudo_purged_snaps[m->pool].insert(pp.get_snap_seq());
      changed = true;
    }
    break;

  case POOL_OP_AUID_CHANGE:
    // auid support was removed; always refuse
    _pool_op_reply(op, -EOPNOTSUPP, osdmap.get_epoch());
    return false;

  default:
    ceph_abort();
    break;
  }

  if (changed) {
    pp.set_snap_epoch(pending_inc.epoch);
    pending_inc.new_pools[m->pool] = pp;
  }

 out:
  // reply is deferred until the proposal commits
  wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret, pending_inc.epoch, &reply_data));
  return true;
}
14185
14186 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op)
14187 {
14188 op->mark_osdmon_event(__func__);
14189 int err = prepare_new_pool(op);
14190 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, err, pending_inc.epoch));
14191 return true;
14192 }
14193
14194 int OSDMonitor::_check_remove_pool(int64_t pool_id, const pg_pool_t& pool,
14195 ostream *ss)
14196 {
14197 const string& poolstr = osdmap.get_pool_name(pool_id);
14198
14199 // If the Pool is in use by CephFS, refuse to delete it
14200 FSMap const &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14201 if (pending_fsmap.pool_in_use(pool_id)) {
14202 *ss << "pool '" << poolstr << "' is in use by CephFS";
14203 return -EBUSY;
14204 }
14205
14206 if (pool.tier_of >= 0) {
14207 *ss << "pool '" << poolstr << "' is a tier of '"
14208 << osdmap.get_pool_name(pool.tier_of) << "'";
14209 return -EBUSY;
14210 }
14211 if (!pool.tiers.empty()) {
14212 *ss << "pool '" << poolstr << "' has tiers";
14213 for(auto tier : pool.tiers) {
14214 *ss << " " << osdmap.get_pool_name(tier);
14215 }
14216 return -EBUSY;
14217 }
14218
14219 if (!g_conf()->mon_allow_pool_delete) {
14220 *ss << "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
14221 return -EPERM;
14222 }
14223
14224 if (pool.has_flag(pg_pool_t::FLAG_NODELETE)) {
14225 *ss << "pool deletion is disabled; you must unset nodelete flag for the pool first";
14226 return -EPERM;
14227 }
14228
14229 *ss << "pool '" << poolstr << "' removed";
14230 return 0;
14231 }
14232
14233 /**
14234 * Check if it is safe to add a tier to a base pool
14235 *
14236 * @return
14237 * True if the operation should proceed, false if we should abort here
14238 * (abort doesn't necessarily mean error, could be idempotency)
14239 */
14240 bool OSDMonitor::_check_become_tier(
14241 const int64_t tier_pool_id, const pg_pool_t *tier_pool,
14242 const int64_t base_pool_id, const pg_pool_t *base_pool,
14243 int *err,
14244 ostream *ss) const
14245 {
14246 const std::string &tier_pool_name = osdmap.get_pool_name(tier_pool_id);
14247 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14248
14249 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14250 if (pending_fsmap.pool_in_use(tier_pool_id)) {
14251 *ss << "pool '" << tier_pool_name << "' is in use by CephFS";
14252 *err = -EBUSY;
14253 return false;
14254 }
14255
14256 if (base_pool->tiers.count(tier_pool_id)) {
14257 ceph_assert(tier_pool->tier_of == base_pool_id);
14258 *err = 0;
14259 *ss << "pool '" << tier_pool_name << "' is now (or already was) a tier of '"
14260 << base_pool_name << "'";
14261 return false;
14262 }
14263
14264 if (base_pool->is_tier()) {
14265 *ss << "pool '" << base_pool_name << "' is already a tier of '"
14266 << osdmap.get_pool_name(base_pool->tier_of) << "', "
14267 << "multiple tiers are not yet supported.";
14268 *err = -EINVAL;
14269 return false;
14270 }
14271
14272 if (tier_pool->has_tiers()) {
14273 *ss << "pool '" << tier_pool_name << "' has following tier(s) already:";
14274 for (set<uint64_t>::iterator it = tier_pool->tiers.begin();
14275 it != tier_pool->tiers.end(); ++it)
14276 *ss << "'" << osdmap.get_pool_name(*it) << "',";
14277 *ss << " multiple tiers are not yet supported.";
14278 *err = -EINVAL;
14279 return false;
14280 }
14281
14282 if (tier_pool->is_tier()) {
14283 *ss << "tier pool '" << tier_pool_name << "' is already a tier of '"
14284 << osdmap.get_pool_name(tier_pool->tier_of) << "'";
14285 *err = -EINVAL;
14286 return false;
14287 }
14288
14289 *err = 0;
14290 return true;
14291 }
14292
14293
14294 /**
14295 * Check if it is safe to remove a tier from this base pool
14296 *
14297 * @return
14298 * True if the operation should proceed, false if we should abort here
14299 * (abort doesn't necessarily mean error, could be idempotency)
14300 */
14301 bool OSDMonitor::_check_remove_tier(
14302 const int64_t base_pool_id, const pg_pool_t *base_pool,
14303 const pg_pool_t *tier_pool,
14304 int *err, ostream *ss) const
14305 {
14306 const std::string &base_pool_name = osdmap.get_pool_name(base_pool_id);
14307
14308 // Apply CephFS-specific checks
14309 const FSMap &pending_fsmap = mon.mdsmon()->get_pending_fsmap();
14310 if (pending_fsmap.pool_in_use(base_pool_id)) {
14311 if (base_pool->is_erasure() && !base_pool->allows_ecoverwrites()) {
14312 // If the underlying pool is erasure coded and does not allow EC
14313 // overwrites, we can't permit the removal of the replicated tier that
14314 // CephFS relies on to access it
14315 *ss << "pool '" << base_pool_name <<
14316 "' does not allow EC overwrites and is in use by CephFS"
14317 " via its tier";
14318 *err = -EBUSY;
14319 return false;
14320 }
14321
14322 if (tier_pool && tier_pool->cache_mode == pg_pool_t::CACHEMODE_WRITEBACK) {
14323 *ss << "pool '" << base_pool_name << "' is in use by CephFS, and this "
14324 "tier is still in use as a writeback cache. Change the cache "
14325 "mode and flush the cache before removing it";
14326 *err = -EBUSY;
14327 return false;
14328 }
14329 }
14330
14331 *err = 0;
14332 return true;
14333 }
14334
// Stage removal of a pool into pending_inc: verify it is safe to delete,
// then (unless faking) queue the deletion and scrub every pg_temp,
// primary_temp, pg_upmap(-items) and crush choose_args entry that refers
// to it.  Returns 0 on success, a negative errno on refusal.
int OSDMonitor::_prepare_remove_pool(
  int64_t pool, ostream *ss, bool no_fake)
{
  dout(10) << __func__ << " " << pool << dendl;
  const pg_pool_t *p = osdmap.get_pg_pool(pool);
  // safety checks against the committed pool state
  int r = _check_remove_pool(pool, *p, ss);
  if (r < 0)
    return r;

  auto new_pool = pending_inc.new_pools.find(pool);
  if (new_pool != pending_inc.new_pools.end()) {
    // if there is a problem with the pending info, wait and retry
    // this op.
    const auto& p = new_pool->second;
    int r = _check_remove_pool(pool, p, ss);
    if (r < 0)
      return -EAGAIN;
  }

  // idempotent: removal already staged in this increment
  if (pending_inc.old_pools.count(pool)) {
    dout(10) << __func__ << " " << pool << " already pending removal"
	     << dendl;
    return 0;
  }

  // optional safety net: rename instead of deleting so data is recoverable
  if (g_conf()->mon_fake_pool_delete && !no_fake) {
    string old_name = osdmap.get_pool_name(pool);
    string new_name = old_name + "." + stringify(pool) + ".DELETED";
    dout(1) << __func__ << " faking pool deletion: renaming " << pool << " "
            << old_name << " -> " << new_name << dendl;
    pending_inc.new_pool_names[pool] = new_name;
    return 0;
  }

  // remove
  pending_inc.old_pools.insert(pool);

  // remove any pg_temp mappings for this pool
  for (auto p = osdmap.pg_temp->begin();
       p != osdmap.pg_temp->end();
       ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool << " removing obsolete pg_temp "
	       << p->first << dendl;
      pending_inc.new_pg_temp[p->first].clear();
    }
  }
  // remove any primary_temp mappings for this pool
  for (auto p = osdmap.primary_temp->begin();
      p != osdmap.primary_temp->end();
      ++p) {
    if (p->first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete primary_temp" << p->first << dendl;
      pending_inc.new_primary_temp[p->first] = -1;
    }
  }
  // remove any pg_upmap mappings for this pool
  for (auto& p : osdmap.pg_upmap) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap "
               << p.first << dendl;
      pending_inc.old_pg_upmap.insert(p.first);
    }
  }
  // remove any pending pg_upmap mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap.begin();
    while (it != pending_inc.new_pg_upmap.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap.erase(it);
      } else {
        it++;
      }
    }
  }
  // remove any pg_upmap_items mappings for this pool
  for (auto& p : osdmap.pg_upmap_items) {
    if (p.first.pool() == pool) {
      dout(10) << __func__ << " " << pool
               << " removing obsolete pg_upmap_items " << p.first
               << dendl;
      pending_inc.old_pg_upmap_items.insert(p.first);
    }
  }
  // remove any pending pg_upmap_items mappings for this pool
  {
    auto it = pending_inc.new_pg_upmap_items.begin();
    while (it != pending_inc.new_pg_upmap_items.end()) {
      if (it->first.pool() == pool) {
        dout(10) << __func__ << " " << pool
                 << " removing pending pg_upmap_items "
                 << it->first << dendl;
        it = pending_inc.new_pg_upmap_items.erase(it);
      } else {
        it++;
      }
    }
  }

  // remove any choose_args for this pool
  CrushWrapper newcrush = _get_pending_crush();
  if (newcrush.have_choose_args(pool)) {
    dout(10) << __func__ << " removing choose_args for pool " << pool << dendl;
    newcrush.rm_choose_args(pool);
    pending_inc.crush.clear();
    newcrush.encode(pending_inc.crush, mon.get_quorum_con_features());
  }
  return 0;
}
14449
14450 int OSDMonitor::_prepare_rename_pool(int64_t pool, string newname)
14451 {
14452 dout(10) << "_prepare_rename_pool " << pool << dendl;
14453 if (pending_inc.old_pools.count(pool)) {
14454 dout(10) << "_prepare_rename_pool " << pool << " pending removal" << dendl;
14455 return -ENOENT;
14456 }
14457 for (map<int64_t,string>::iterator p = pending_inc.new_pool_names.begin();
14458 p != pending_inc.new_pool_names.end();
14459 ++p) {
14460 if (p->second == newname && p->first != pool) {
14461 return -EEXIST;
14462 }
14463 }
14464
14465 pending_inc.new_pool_names[pool] = newname;
14466 return 0;
14467 }
14468
14469 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op)
14470 {
14471 op->mark_osdmon_event(__func__);
14472 auto m = op->get_req<MPoolOp>();
14473 ostringstream ss;
14474 int ret = _prepare_remove_pool(m->pool, &ss, false);
14475 if (ret == -EAGAIN) {
14476 wait_for_finished_proposal(op, new C_RetryMessage(this, op));
14477 return true;
14478 }
14479 if (ret < 0)
14480 dout(10) << __func__ << " got " << ret << " " << ss.str() << dendl;
14481 wait_for_finished_proposal(op, new OSDMonitor::C_PoolOp(this, op, ret,
14482 pending_inc.epoch));
14483 return true;
14484 }
14485
14486 void OSDMonitor::_pool_op_reply(MonOpRequestRef op,
14487 int ret, epoch_t epoch, bufferlist *blp)
14488 {
14489 op->mark_osdmon_event(__func__);
14490 auto m = op->get_req<MPoolOp>();
14491 dout(20) << "_pool_op_reply " << ret << dendl;
14492 MPoolOpReply *reply = new MPoolOpReply(m->fsid, m->get_tid(),
14493 ret, epoch, get_last_committed(), blp);
14494 mon.send_reply(op, reply);
14495 }
14496
// One-time migration: rescale per-pool recovery_priority values that fall
// outside [OSD_POOL_PRIORITY_MIN, OSD_POOL_PRIORITY_MAX] so the relative
// ordering of pools is preserved within the new bounds.  No-op if every
// priority already fits.
void OSDMonitor::convert_pool_priorities(void)
{
  pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key;
  int64_t max_prio = 0;
  int64_t min_prio = 0;
  // first pass: find the extreme priorities currently in use
  for (const auto &i : osdmap.get_pools()) {
    const auto &pool = i.second;

    if (pool.opts.is_set(key)) {
      int64_t prio = 0;
      pool.opts.get(key, &prio);
      if (prio > max_prio)
	max_prio = prio;
      if (prio < min_prio)
	min_prio = prio;
    }
  }
  if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) {
    dout(20) << __func__ << " nothing to fix" << dendl;
    return;
  }
  // Current pool priorities exceeds new maximum
  for (const auto &i : osdmap.get_pools()) {
    const auto pool_id = i.first;
    pg_pool_t pool = i.second;

    // pools without the option read prio == 0 and hit the `continue` below
    int64_t prio = 0;
    pool.opts.get(key, &prio);
    int64_t n;

    if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario
      // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
      n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX;
    } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) {
      // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
      n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN;
    } else {
      continue;
    }
    // a priority scaled down to 0 is equivalent to unset
    if (n == 0) {
      pool.opts.unset(key);
    } else {
      pool.opts.set(key, static_cast<int64_t>(n));
    }
    dout(10) << __func__ << " pool " << pool_id
	     << " recovery_priority adjusted "
	     << prio << " to " << n << dendl;
    pool.last_change = pending_inc.epoch;
    pending_inc.new_pools[pool_id] = pool;
  }
}
14548
14549 void OSDMonitor::try_enable_stretch_mode_pools(stringstream& ss, bool *okay,
14550 int *errcode,
14551 set<pg_pool_t*>* pools,
14552 const string& new_crush_rule)
14553 {
14554 dout(20) << __func__ << dendl;
14555 *okay = false;
14556 int new_crush_rule_result = osdmap.crush->get_rule_id(new_crush_rule);
14557 if (new_crush_rule_result < 0) {
14558 ss << "unrecognized crush rule " << new_crush_rule_result;
14559 *errcode = new_crush_rule_result;
14560 return;
14561 }
14562 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14563 for (const auto& pooli : osdmap.pools) {
14564 int64_t poolid = pooli.first;
14565 const pg_pool_t *p = &pooli.second;
14566 if (!p->is_replicated()) {
14567 ss << "stretched pools must be replicated; '" << osdmap.pool_name[poolid] << "' is erasure-coded";
14568 *errcode = -EINVAL;
14569 return;
14570 }
14571 uint8_t default_size = g_conf().get_val<uint64_t>("osd_pool_default_size");
14572 if ((p->get_size() != default_size ||
14573 (p->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size))) &&
14574 (p->get_crush_rule() != new_rule)) {
14575 ss << "we currently require stretch mode pools start out with the"
14576 " default size/min_size, which '" << osdmap.pool_name[poolid] << "' does not";
14577 *errcode = -EINVAL;
14578 return;
14579 }
14580 pg_pool_t *pp = pending_inc.get_new_pool(poolid, p);
14581 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14582 // the attempt may fail and then we have these pool updates...but they won't do anything
14583 // if there is a failure, so if it's hard to change the interface, no need to bother
14584 pools->insert(pp);
14585 }
14586 *okay = true;
14587 return;
14588 }
14589
14590 void OSDMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
14591 int *errcode, bool commit,
14592 const string& dividing_bucket,
14593 uint32_t bucket_count,
14594 const set<pg_pool_t*>& pools,
14595 const string& new_crush_rule)
14596 {
14597 dout(20) << __func__ << dendl;
14598 *okay = false;
14599 CrushWrapper crush = _get_pending_crush();
14600 int dividing_id = -1;
14601 if (auto type_id = crush.get_validated_type_id(dividing_bucket);
14602 !type_id.has_value()) {
14603 ss << dividing_bucket << " is not a valid crush bucket type";
14604 *errcode = -ENOENT;
14605 ceph_assert(!commit);
14606 return;
14607 } else {
14608 dividing_id = *type_id;
14609 }
14610 vector<int> subtrees;
14611 crush.get_subtree_of_type(dividing_id, &subtrees);
14612 if (subtrees.size() != 2) {
14613 ss << "there are " << subtrees.size() << dividing_bucket
14614 << "'s in the cluster but stretch mode currently only works with 2!";
14615 *errcode = -EINVAL;
14616 ceph_assert(!commit || subtrees.size() == 2);
14617 return;
14618 }
14619
14620 int new_crush_rule_result = crush.get_rule_id(new_crush_rule);
14621 if (new_crush_rule_result < 0) {
14622 ss << "unrecognized crush rule " << new_crush_rule;
14623 *errcode = new_crush_rule_result;
14624 ceph_assert(!commit || (new_crush_rule_result > 0));
14625 return;
14626 }
14627 __u8 new_rule = static_cast<__u8>(new_crush_rule_result);
14628
14629 int weight1 = crush.get_item_weight(subtrees[0]);
14630 int weight2 = crush.get_item_weight(subtrees[1]);
14631 if (weight1 != weight2) {
14632 // TODO: I'm really not sure this is a good idea?
14633 ss << "the 2 " << dividing_bucket
14634 << "instances in the cluster have differing weights "
14635 << weight1 << " and " << weight2
14636 <<" but stretch mode currently requires they be the same!";
14637 *errcode = -EINVAL;
14638 ceph_assert(!commit || (weight1 == weight2));
14639 return;
14640 }
14641 if (bucket_count != 2) {
14642 ss << "currently we only support 2-site stretch clusters!";
14643 *errcode = -EINVAL;
14644 ceph_assert(!commit || bucket_count == 2);
14645 return;
14646 }
14647 // TODO: check CRUSH rules for pools so that we are appropriately divided
14648 if (commit) {
14649 for (auto pool : pools) {
14650 pool->crush_rule = new_rule;
14651 pool->peering_crush_bucket_count = bucket_count;
14652 pool->peering_crush_bucket_target = bucket_count;
14653 pool->peering_crush_bucket_barrier = dividing_id;
14654 pool->peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14655 pool->size = g_conf().get_val<uint64_t>("mon_stretch_pool_size");
14656 pool->min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14657 }
14658 pending_inc.change_stretch_mode = true;
14659 pending_inc.stretch_mode_enabled = true;
14660 pending_inc.new_stretch_bucket_count = bucket_count;
14661 pending_inc.new_degraded_stretch_mode = 0;
14662 pending_inc.new_stretch_mode_bucket = dividing_id;
14663 }
14664 *okay = true;
14665 return;
14666 }
14667
14668 bool OSDMonitor::check_for_dead_crush_zones(const map<string,set<string>>& dead_buckets,
14669 set<int> *really_down_buckets,
14670 set<string> *really_down_mons)
14671 {
14672 dout(20) << __func__ << " with dead mon zones " << dead_buckets << dendl;
14673 ceph_assert(is_readable());
14674 if (dead_buckets.empty()) return false;
14675 set<int> down_cache;
14676 bool really_down = false;
14677 for (auto dbi : dead_buckets) {
14678 const string& bucket_name = dbi.first;
14679 ceph_assert(osdmap.crush->name_exists(bucket_name));
14680 int bucket_id = osdmap.crush->get_item_id(bucket_name);
14681 dout(20) << "Checking " << bucket_name << " id " << bucket_id
14682 << " to see if OSDs are also down" << dendl;
14683 bool subtree_down = osdmap.subtree_is_down(bucket_id, &down_cache);
14684 if (subtree_down) {
14685 dout(20) << "subtree is down!" << dendl;
14686 really_down = true;
14687 really_down_buckets->insert(bucket_id);
14688 really_down_mons->insert(dbi.second.begin(), dbi.second.end());
14689 }
14690 }
14691 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14692 << " and mons " << *really_down_mons << " are really down" << dendl;
14693 return really_down;
14694 }
14695
14696 void OSDMonitor::trigger_degraded_stretch_mode(const set<int>& dead_buckets,
14697 const set<string>& live_zones)
14698 {
14699 dout(20) << __func__ << dendl;
14700 stretch_recovery_triggered.set_from_double(0); // reset this; we can't go clean now!
14701 // update the general OSDMap changes
14702 pending_inc.change_stretch_mode = true;
14703 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14704 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14705 int new_site_count = osdmap.stretch_bucket_count - dead_buckets.size();
14706 ceph_assert(new_site_count == 1); // stretch count 2!
14707 pending_inc.new_degraded_stretch_mode = new_site_count;
14708 pending_inc.new_recovering_stretch_mode = 0;
14709 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14710
14711 // and then apply them to all the pg_pool_ts
14712 ceph_assert(live_zones.size() == 1); // only support 2 zones now
14713 const string& remaining_site_name = *(live_zones.begin());
14714 ceph_assert(osdmap.crush->name_exists(remaining_site_name));
14715 int remaining_site = osdmap.crush->get_item_id(remaining_site_name);
14716 for (auto pgi : osdmap.pools) {
14717 if (pgi.second.peering_crush_bucket_count) {
14718 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14719 newp.peering_crush_bucket_count = new_site_count;
14720 newp.peering_crush_mandatory_member = remaining_site;
14721 newp.min_size = pgi.second.min_size / 2; // only support 2 zones now
14722 newp.set_last_force_op_resend(pending_inc.epoch);
14723 }
14724 }
14725 propose_pending();
14726 }
14727
14728 void OSDMonitor::trigger_recovery_stretch_mode()
14729 {
14730 dout(20) << __func__ << dendl;
14731 stretch_recovery_triggered.set_from_double(0); // reset this so we don't go full-active prematurely
14732 pending_inc.change_stretch_mode = true;
14733 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14734 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14735 pending_inc.new_degraded_stretch_mode = osdmap.degraded_stretch_mode;
14736 pending_inc.new_recovering_stretch_mode = 1;
14737 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14738
14739 for (auto pgi : osdmap.pools) {
14740 if (pgi.second.peering_crush_bucket_count) {
14741 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14742 newp.set_last_force_op_resend(pending_inc.epoch);
14743 }
14744 }
14745 propose_pending();
14746 }
14747
// Entering degraded stretch mode: zero the recovery-start timestamp
// (a zero value means the recovery grace period has not begun).
void OSDMonitor::set_degraded_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14752
14753 void OSDMonitor::set_recovery_stretch_mode()
14754 {
14755 if (stretch_recovery_triggered.is_zero()) {
14756 stretch_recovery_triggered = ceph_clock_now();
14757 }
14758 }
14759
// Back to healthy stretch mode: clear the recovery-start timestamp so a
// future degradation starts its grace period from scratch.
void OSDMonitor::set_healthy_stretch_mode()
{
  stretch_recovery_triggered.set_from_double(0);
}
14764
14765 void OSDMonitor::notify_new_pg_digest()
14766 {
14767 dout(20) << __func__ << dendl;
14768 if (!stretch_recovery_triggered.is_zero()) {
14769 try_end_recovery_stretch_mode(false);
14770 }
14771 }
14772
14773 struct CMonExitRecovery : public Context {
14774 OSDMonitor *m;
14775 bool force;
14776 CMonExitRecovery(OSDMonitor *mon, bool f) : m(mon), force(f) {}
14777 void finish(int r) {
14778 m->try_end_recovery_stretch_mode(force);
14779 }
14780 };
14781
// Attempt to leave recovering stretch mode and return to healthy.
// Only meaningful on the leader while in degraded+recovering stretch mode.
// If this monitor (or the mgrstat paxos service) is not currently readable,
// the check is re-queued via CMonExitRecovery rather than dropped.
// `force` skips both the minimum-wait grace period and the PG-health check.
void OSDMonitor::try_end_recovery_stretch_mode(bool force)
{
  dout(20) << __func__ << dendl;
  if (!mon.is_leader()) return;
  if (!mon.is_degraded_stretch_mode()) return;
  if (!mon.is_recovering_stretch_mode()) return;
  if (!is_readable()) {
    // not readable yet; retry from the same state once we are
    wait_for_readable_ctx(new CMonExitRecovery(this, force));
    return;
  }

  // Proceed only if recovery has run for at least
  // mon_stretch_recovery_min_wait seconds (or we're forced).
  if (osdmap.recovering_stretch_mode &&
      ((!stretch_recovery_triggered.is_zero() &&
	ceph_clock_now() - g_conf().get_val<double>("mon_stretch_recovery_min_wait") >
	stretch_recovery_triggered) ||
       force)) {
    if (!mon.mgrstatmon()->is_readable()) {
      // PG digest not readable yet; retry when it is
      mon.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force));
      return;
    }
    const PGMapDigest& pgd = mon.mgrstatmon()->get_digest();
    double misplaced, degraded, inactive, unknown;
    pgd.get_recovery_stats(&misplaced, &degraded, &inactive, &unknown);
    // misplaced PGs are tolerated; degraded/inactive/unknown are not
    if (force || (degraded == 0.0 && inactive == 0.0 && unknown == 0.0)) {
      // we can exit degraded stretch mode!
      mon.trigger_healthy_stretch_mode();
    }
  }
}
14811
14812 void OSDMonitor::trigger_healthy_stretch_mode()
14813 {
14814 ceph_assert(is_writeable());
14815 stretch_recovery_triggered.set_from_double(0);
14816 pending_inc.change_stretch_mode = true;
14817 pending_inc.stretch_mode_enabled = osdmap.stretch_mode_enabled;
14818 pending_inc.new_stretch_bucket_count = osdmap.stretch_bucket_count;
14819 pending_inc.new_degraded_stretch_mode = 0; // turn off degraded mode...
14820 pending_inc.new_recovering_stretch_mode = 0; //...and recovering mode!
14821 pending_inc.new_stretch_mode_bucket = osdmap.stretch_mode_bucket;
14822 for (auto pgi : osdmap.pools) {
14823 if (pgi.second.peering_crush_bucket_count) {
14824 pg_pool_t& newp = *pending_inc.get_new_pool(pgi.first, &pgi.second);
14825 newp.peering_crush_bucket_count = osdmap.stretch_bucket_count;
14826 newp.peering_crush_mandatory_member = CRUSH_ITEM_NONE;
14827 newp.min_size = g_conf().get_val<uint64_t>("mon_stretch_pool_min_size");
14828 newp.set_last_force_op_resend(pending_inc.epoch);
14829 }
14830 }
14831 propose_pending();
14832 }