// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 * Copyright (C) 2014 Red Hat <contact@redhat.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
20 #include <boost/algorithm/string.hpp>
21 #include <experimental/iterator>
25 #include "mon/OSDMonitor.h"
26 #include "mon/Monitor.h"
27 #include "mon/MDSMonitor.h"
28 #include "mon/MgrStatMonitor.h"
29 #include "mon/AuthMonitor.h"
30 #include "mon/KVMonitor.h"
32 #include "mon/MonitorDBStore.h"
33 #include "mon/Session.h"
35 #include "crush/CrushWrapper.h"
36 #include "crush/CrushTester.h"
37 #include "crush/CrushTreeDumper.h"
39 #include "messages/MOSDBeacon.h"
40 #include "messages/MOSDFailure.h"
41 #include "messages/MOSDMarkMeDown.h"
42 #include "messages/MOSDMarkMeDead.h"
43 #include "messages/MOSDFull.h"
44 #include "messages/MOSDMap.h"
45 #include "messages/MMonGetOSDMap.h"
46 #include "messages/MOSDBoot.h"
47 #include "messages/MOSDAlive.h"
48 #include "messages/MPoolOp.h"
49 #include "messages/MPoolOpReply.h"
50 #include "messages/MOSDPGCreate.h"
51 #include "messages/MOSDPGCreate2.h"
52 #include "messages/MOSDPGCreated.h"
53 #include "messages/MOSDPGTemp.h"
54 #include "messages/MOSDPGReadyToMerge.h"
55 #include "messages/MMonCommand.h"
56 #include "messages/MRemoveSnaps.h"
57 #include "messages/MOSDScrub.h"
58 #include "messages/MRoute.h"
59 #include "messages/MMonGetPurgedSnaps.h"
60 #include "messages/MMonGetPurgedSnapsReply.h"
62 #include "common/TextTable.h"
63 #include "common/Timer.h"
64 #include "common/ceph_argparse.h"
65 #include "common/perf_counters.h"
66 #include "common/PriorityCache.h"
67 #include "common/strtol.h"
68 #include "common/numa.h"
70 #include "common/config.h"
71 #include "common/errno.h"
73 #include "erasure-code/ErasureCodePlugin.h"
74 #include "compressor/Compressor.h"
75 #include "common/Checksummer.h"
77 #include "include/compat.h"
78 #include "include/ceph_assert.h"
79 #include "include/stringify.h"
80 #include "include/util.h"
81 #include "common/cmdparse.h"
82 #include "include/str_list.h"
83 #include "include/str_map.h"
84 #include "include/scope_guard.h"
85 #include "perfglue/heap_profiler.h"
87 #include "auth/cephx/CephxKeyServer.h"
88 #include "osd/OSDCap.h"
90 #include "json_spirit/json_spirit_reader.h"
92 #include <boost/algorithm/string/predicate.hpp>
99 using std::ostringstream
;
103 using std::stringstream
;
104 using std::to_string
;
107 using ceph::bufferlist
;
110 using ceph::ErasureCodeInterfaceRef
;
111 using ceph::ErasureCodePluginRegistry
;
112 using ceph::ErasureCodeProfile
;
113 using ceph::Formatter
;
114 using ceph::JSONFormatter
;
115 using ceph::make_message
;
#define dout_subsys ceph_subsys_mon

// Key/value-store prefixes under which the OSDMonitor persists its state:
// in-flight PG creations, per-OSD metadata, and snapshot bookkeeping.
static const std::string OSD_PG_CREATING_PREFIX("osd_pg_creating");
static const std::string OSD_METADATA_PREFIX("osd_metadata");
static const std::string OSD_SNAP_PREFIX("osd_snap");
/* OSD snapshot metadata
   ---------------------

   -- starting with mimic, removed in octopus --

   "removed_epoch_%llu_%08lx" % (pool, epoch)
   -> interval_set<snapid_t>

   "removed_snap_%llu_%016llx" % (pool, last_snap)
   -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)


   -- starting with mimic --

   "purged_snap_%llu_%016llx" % (pool, last_snap)
   -> { first_snap, end_snap, epoch } (last_snap = end_snap - 1)

   - note that the {removed,purged}_snap put the last snap in the key so
     that we can use forward iteration only to search for an epoch in an
     interval. e.g., to test if epoch N is removed/purged, we'll find a key
     >= N that either does or doesn't contain the given snap.


   -- starting with octopus --

   "purged_epoch_%08lx" % epoch
   -> map<int64_t,interval_set<snapid_t>>
*/
153 using namespace TOPNSPC::common
;
156 struct OSDMemCache
: public PriorityCache::PriCache
{
158 int64_t cache_bytes
[PriorityCache::Priority::LAST
+1] = {0};
159 int64_t committed_bytes
= 0;
160 double cache_ratio
= 0;
162 OSDMemCache(OSDMonitor
*m
) : osdmon(m
) {};
164 virtual uint64_t _get_used_bytes() const = 0;
166 virtual int64_t request_cache_bytes(
167 PriorityCache::Priority pri
, uint64_t total_cache
) const {
168 int64_t assigned
= get_cache_bytes(pri
);
171 // All cache items are currently set to have PRI1 priority
172 case PriorityCache::Priority::PRI1
:
174 int64_t request
= _get_used_bytes();
175 return (request
> assigned
) ? request
- assigned
: 0;
183 virtual int64_t get_cache_bytes(PriorityCache::Priority pri
) const {
184 return cache_bytes
[pri
];
187 virtual int64_t get_cache_bytes() const {
190 for (int i
= 0; i
< PriorityCache::Priority::LAST
+ 1; i
++) {
191 PriorityCache::Priority pri
= static_cast<PriorityCache::Priority
>(i
);
192 total
+= get_cache_bytes(pri
);
197 virtual void set_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
198 cache_bytes
[pri
] = bytes
;
200 virtual void add_cache_bytes(PriorityCache::Priority pri
, int64_t bytes
) {
201 cache_bytes
[pri
] += bytes
;
203 virtual int64_t commit_cache_size(uint64_t total_cache
) {
204 committed_bytes
= PriorityCache::get_chunk(
205 get_cache_bytes(), total_cache
);
206 return committed_bytes
;
208 virtual int64_t get_committed_size() const {
209 return committed_bytes
;
211 virtual double get_cache_ratio() const {
214 virtual void set_cache_ratio(double ratio
) {
217 virtual string
get_cache_name() const = 0;
220 struct IncCache
: public OSDMemCache
{
221 IncCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
223 virtual uint64_t _get_used_bytes() const {
224 return osdmon
->inc_osd_cache
.get_bytes();
227 virtual string
get_cache_name() const {
228 return "OSDMap Inc Cache";
231 uint64_t _get_num_osdmaps() const {
232 return osdmon
->inc_osd_cache
.get_size();
236 struct FullCache
: public OSDMemCache
{
237 FullCache(OSDMonitor
*m
) : OSDMemCache(m
) {};
239 virtual uint64_t _get_used_bytes() const {
240 return osdmon
->full_osd_cache
.get_bytes();
243 virtual string
get_cache_name() const {
244 return "OSDMap Full Cache";
247 uint64_t _get_num_osdmaps() const {
248 return osdmon
->full_osd_cache
.get_size();
252 std::shared_ptr
<IncCache
> inc_cache
;
253 std::shared_ptr
<FullCache
> full_cache
;
255 const uint32_t MAX_POOL_APPLICATIONS
= 4;
256 const uint32_t MAX_POOL_APPLICATION_KEYS
= 64;
257 const uint32_t MAX_POOL_APPLICATION_LENGTH
= 128;
259 bool is_osd_writable(const OSDCapGrant
& grant
, const std::string
* pool_name
) {
260 // Note: this doesn't include support for the application tag match
261 if ((grant
.spec
.allow
& OSD_CAP_W
) != 0) {
262 auto& match
= grant
.match
;
263 if (match
.is_match_all()) {
265 } else if (pool_name
!= nullptr &&
266 !match
.pool_namespace
.pool_name
.empty() &&
267 match
.pool_namespace
.pool_name
== *pool_name
) {
274 bool is_unmanaged_snap_op_permitted(CephContext
* cct
,
275 const KeyServer
& key_server
,
276 const EntityName
& entity_name
,
277 const MonCap
& mon_caps
,
278 const entity_addr_t
& peer_socket_addr
,
279 const std::string
* pool_name
)
281 typedef std::map
<std::string
, std::string
> CommandArgs
;
283 if (mon_caps
.is_capable(
284 cct
, entity_name
, "osd",
285 "osd pool op unmanaged-snap",
286 (pool_name
== nullptr ?
287 CommandArgs
{} /* pool DNE, require unrestricted cap */ :
288 CommandArgs
{{"poolname", *pool_name
}}),
294 AuthCapsInfo caps_info
;
295 if (!key_server
.get_service_caps(entity_name
, CEPH_ENTITY_TYPE_OSD
,
297 dout(10) << "unable to locate OSD cap data for " << entity_name
298 << " in auth db" << dendl
;
303 if (caps_info
.caps
.length() > 0) {
304 auto p
= caps_info
.caps
.cbegin();
307 } catch (const ceph::buffer::error
&err
) {
308 derr
<< "corrupt OSD cap data for " << entity_name
<< " in auth db"
315 if (!osd_cap
.parse(caps_str
, nullptr)) {
316 dout(10) << "unable to parse OSD cap data for " << entity_name
317 << " in auth db" << dendl
;
321 // if the entity has write permissions in one or all pools, permit
322 // usage of unmanaged-snapshots
323 if (osd_cap
.allow_all()) {
327 for (auto& grant
: osd_cap
.grants
) {
328 if (grant
.profile
.is_valid()) {
329 for (auto& profile_grant
: grant
.profile_grants
) {
330 if (is_osd_writable(profile_grant
, pool_name
)) {
334 } else if (is_osd_writable(grant
, pool_name
)) {
342 } // anonymous namespace
344 void LastEpochClean::Lec::report(ps_t ps
, epoch_t last_epoch_clean
)
346 if (epoch_by_pg
.size() <= ps
) {
347 epoch_by_pg
.resize(ps
+ 1, 0);
349 const auto old_lec
= epoch_by_pg
[ps
];
350 if (old_lec
>= last_epoch_clean
) {
354 epoch_by_pg
[ps
] = last_epoch_clean
;
355 if (last_epoch_clean
< floor
) {
356 floor
= last_epoch_clean
;
357 } else if (last_epoch_clean
> floor
) {
358 if (old_lec
== floor
) {
359 // probably should increase floor?
360 auto new_floor
= std::min_element(std::begin(epoch_by_pg
),
361 std::end(epoch_by_pg
));
365 if (ps
!= next_missing
) {
368 for (; next_missing
< epoch_by_pg
.size(); next_missing
++) {
369 if (epoch_by_pg
[next_missing
] == 0) {
375 void LastEpochClean::remove_pool(uint64_t pool
)
377 report_by_pool
.erase(pool
);
380 void LastEpochClean::report(const pg_t
& pg
, epoch_t last_epoch_clean
)
382 auto& lec
= report_by_pool
[pg
.pool()];
383 return lec
.report(pg
.ps(), last_epoch_clean
);
386 epoch_t
LastEpochClean::get_lower_bound(const OSDMap
& latest
) const
388 auto floor
= latest
.get_epoch();
389 for (auto& pool
: latest
.get_pools()) {
390 auto reported
= report_by_pool
.find(pool
.first
);
391 if (reported
== report_by_pool
.end()) {
394 if (reported
->second
.next_missing
< pool
.second
.get_pg_num()) {
397 if (reported
->second
.floor
< floor
) {
398 floor
= reported
->second
.floor
;
404 void LastEpochClean::dump(Formatter
*f
) const
406 f
->open_array_section("per_pool");
408 for (auto& [pool
, lec
] : report_by_pool
) {
409 f
->open_object_section("pool");
410 f
->dump_unsigned("poolid", pool
);
411 f
->dump_unsigned("floor", lec
.floor
);
418 class C_UpdateCreatingPGs
: public Context
{
423 C_UpdateCreatingPGs(OSDMonitor
*osdmon
, epoch_t e
) :
424 osdmon(osdmon
), start(ceph_clock_now()), epoch(e
) {}
425 void finish(int r
) override
{
427 utime_t end
= ceph_clock_now();
428 dout(10) << "osdmap epoch " << epoch
<< " mapping took "
429 << (end
- start
) << " seconds" << dendl
;
430 osdmon
->update_creating_pgs();
431 osdmon
->check_pg_creates_subs();
437 #define dout_prefix _prefix(_dout, mon, osdmap)
438 static ostream
& _prefix(std::ostream
*_dout
, Monitor
&mon
, const OSDMap
& osdmap
) {
439 return *_dout
<< "mon." << mon
.name
<< "@" << mon
.rank
440 << "(" << mon
.get_state_name()
441 << ").osd e" << osdmap
.get_epoch() << " ";
444 OSDMonitor::OSDMonitor(
448 const string
& service_name
)
449 : PaxosService(mn
, p
, service_name
),
451 inc_osd_cache(g_conf()->mon_osd_cache_size
),
452 full_osd_cache(g_conf()->mon_osd_cache_size
),
453 has_osdmap_manifest(false),
454 mapper(mn
.cct
, &mn
.cpu_tp
)
456 inc_cache
= std::make_shared
<IncCache
>(this);
457 full_cache
= std::make_shared
<FullCache
>(this);
458 cct
->_conf
.add_observer(this);
459 int r
= _set_cache_sizes();
461 derr
<< __func__
<< " using default osd cache size - mon_osd_cache_size ("
462 << g_conf()->mon_osd_cache_size
463 << ") without priority cache management"
468 const char **OSDMonitor::get_tracked_conf_keys() const
470 static const char* KEYS
[] = {
472 "mon_memory_autotune",
473 "rocksdb_cache_size",
479 void OSDMonitor::handle_conf_change(const ConfigProxy
& conf
,
480 const std::set
<std::string
> &changed
)
482 dout(10) << __func__
<< " " << changed
<< dendl
;
484 if (changed
.count("mon_memory_autotune")) {
485 _set_cache_autotuning();
487 if (changed
.count("mon_memory_target") ||
488 changed
.count("rocksdb_cache_size")) {
489 int r
= _update_mon_cache_settings();
491 derr
<< __func__
<< " mon_memory_target:"
492 << g_conf()->mon_memory_target
493 << " rocksdb_cache_size:"
494 << g_conf()->rocksdb_cache_size
495 << ". Unable to update cache size."
501 void OSDMonitor::_set_cache_autotuning()
503 if (!g_conf()->mon_memory_autotune
&& pcm
!= nullptr) {
504 // Disable cache autotuning
505 std::lock_guard
l(balancer_lock
);
509 if (g_conf()->mon_memory_autotune
&& pcm
== nullptr) {
510 int r
= register_cache_with_pcm();
513 << " Error while registering osdmon caches with pcm."
514 << " Cache auto tuning not enabled."
516 mon_memory_autotune
= false;
518 mon_memory_autotune
= true;
523 int OSDMonitor::_update_mon_cache_settings()
525 if (g_conf()->mon_memory_target
<= 0 ||
526 g_conf()->mon_memory_target
< mon_memory_min
||
527 g_conf()->rocksdb_cache_size
<= 0) {
531 if (pcm
== nullptr && rocksdb_binned_kv_cache
== nullptr) {
532 derr
<< __func__
<< " not using pcm and rocksdb" << dendl
;
536 uint64_t old_mon_memory_target
= mon_memory_target
;
537 uint64_t old_rocksdb_cache_size
= rocksdb_cache_size
;
539 // Set the new pcm memory cache sizes
540 mon_memory_target
= g_conf()->mon_memory_target
;
541 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
543 uint64_t base
= mon_memory_base
;
544 double fragmentation
= mon_memory_fragmentation
;
545 uint64_t target
= mon_memory_target
;
546 uint64_t min
= mon_memory_min
;
549 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
550 if (ltarget
> base
+ min
) {
551 max
= ltarget
- base
;
554 int r
= _set_cache_ratios();
556 derr
<< __func__
<< " Cache ratios for pcm could not be set."
557 << " Review the kv (rocksdb) and mon_memory_target sizes."
559 mon_memory_target
= old_mon_memory_target
;
560 rocksdb_cache_size
= old_rocksdb_cache_size
;
564 if (mon_memory_autotune
&& pcm
!= nullptr) {
565 std::lock_guard
l(balancer_lock
);
566 // set pcm cache levels
567 pcm
->set_target_memory(target
);
568 pcm
->set_min_memory(min
);
569 pcm
->set_max_memory(max
);
570 // tune memory based on new values
573 _set_new_cache_sizes();
574 dout(1) << __func__
<< " Updated mon cache setting."
575 << " target: " << target
583 int OSDMonitor::_set_cache_sizes()
585 if (g_conf()->mon_memory_autotune
) {
586 // set the new osdmon cache targets to be managed by pcm
587 mon_osd_cache_size
= g_conf()->mon_osd_cache_size
;
588 rocksdb_cache_size
= g_conf()->rocksdb_cache_size
;
589 mon_memory_base
= cct
->_conf
.get_val
<Option::size_t>("osd_memory_base");
590 mon_memory_fragmentation
= cct
->_conf
.get_val
<double>("osd_memory_expected_fragmentation");
591 mon_memory_target
= g_conf()->mon_memory_target
;
592 mon_memory_min
= g_conf()->mon_osd_cache_size_min
;
593 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
594 derr
<< __func__
<< " mon_memory_target:" << mon_memory_target
595 << " mon_memory_min:" << mon_memory_min
596 << ". Invalid size option(s) provided."
600 // Set the initial inc and full LRU cache sizes
601 inc_osd_cache
.set_bytes(mon_memory_min
);
602 full_osd_cache
.set_bytes(mon_memory_min
);
603 mon_memory_autotune
= g_conf()->mon_memory_autotune
;
608 bool OSDMonitor::_have_pending_crush()
610 return pending_inc
.crush
.length() > 0;
613 CrushWrapper
&OSDMonitor::_get_stable_crush()
615 return *osdmap
.crush
;
618 void OSDMonitor::_get_pending_crush(CrushWrapper
& newcrush
)
621 if (pending_inc
.crush
.length())
622 bl
= pending_inc
.crush
;
624 osdmap
.crush
->encode(bl
, CEPH_FEATURES_SUPPORTED_DEFAULT
);
626 auto p
= bl
.cbegin();
630 void OSDMonitor::create_initial()
632 dout(10) << "create_initial for " << mon
.monmap
->fsid
<< dendl
;
637 mon
.store
->get("mkfs", "osdmap", bl
);
641 newmap
.set_fsid(mon
.monmap
->fsid
);
643 newmap
.build_simple(cct
, 0, mon
.monmap
->fsid
, 0);
646 newmap
.created
= newmap
.modified
= ceph_clock_now();
648 // new clusters should sort bitwise by default.
649 newmap
.set_flag(CEPH_OSDMAP_SORTBITWISE
);
652 CEPH_OSDMAP_RECOVERY_DELETES
|
653 CEPH_OSDMAP_PURGED_SNAPDIRS
|
654 CEPH_OSDMAP_PGLOG_HARDLIMIT
;
655 newmap
.full_ratio
= g_conf()->mon_osd_full_ratio
;
656 if (newmap
.full_ratio
> 1.0) newmap
.full_ratio
/= 100;
657 newmap
.backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
658 if (newmap
.backfillfull_ratio
> 1.0) newmap
.backfillfull_ratio
/= 100;
659 newmap
.nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
660 if (newmap
.nearfull_ratio
> 1.0) newmap
.nearfull_ratio
/= 100;
662 // new cluster should require latest by default
663 if (g_conf().get_val
<bool>("mon_debug_no_require_pacific")) {
664 if (g_conf().get_val
<bool>("mon_debug_no_require_octopus")) {
665 derr
<< __func__
<< " mon_debug_no_require_pacific and octopus=true" << dendl
;
666 newmap
.require_osd_release
= ceph_release_t::nautilus
;
668 derr
<< __func__
<< " mon_debug_no_require_pacific=true" << dendl
;
669 newmap
.require_osd_release
= ceph_release_t::octopus
;
672 newmap
.require_osd_release
= ceph_release_t::pacific
;
675 if (newmap
.require_osd_release
>= ceph_release_t::octopus
) {
676 ceph_release_t r
= ceph_release_from_name(
677 g_conf()->mon_osd_initial_require_min_compat_client
);
679 ceph_abort_msg("mon_osd_initial_require_min_compat_client is not valid");
681 newmap
.require_min_compat_client
= r
;
684 // encode into pending incremental
685 uint64_t features
= newmap
.get_encoding_features();
686 newmap
.encode(pending_inc
.fullmap
,
687 features
| CEPH_FEATURE_RESERVED
);
688 pending_inc
.full_crc
= newmap
.get_crc();
689 dout(20) << " full crc " << pending_inc
.full_crc
<< dendl
;
692 void OSDMonitor::get_store_prefixes(std::set
<string
>& s
) const
694 s
.insert(service_name
);
695 s
.insert(OSD_PG_CREATING_PREFIX
);
696 s
.insert(OSD_METADATA_PREFIX
);
697 s
.insert(OSD_SNAP_PREFIX
);
700 void OSDMonitor::update_from_paxos(bool *need_bootstrap
)
702 // we really don't care if the version has been updated, because we may
703 // have trimmed without having increased the last committed; yet, we may
704 // need to update the in-memory manifest.
705 load_osdmap_manifest();
707 version_t version
= get_last_committed();
708 if (version
== osdmap
.epoch
)
710 ceph_assert(version
> osdmap
.epoch
);
712 dout(15) << "update_from_paxos paxos e " << version
713 << ", my e " << osdmap
.epoch
<< dendl
;
715 int prev_num_up_osd
= osdmap
.num_up_osd
;
718 if (!mapping_job
->is_done()) {
719 dout(1) << __func__
<< " mapping job "
720 << mapping_job
.get() << " did not complete, "
721 << mapping_job
->shards
<< " left, canceling" << dendl
;
722 mapping_job
->abort();
730 * We will possibly have a stashed latest that *we* wrote, and we will
731 * always be sure to have the oldest full map in the first..last range
732 * due to encode_trim_extra(), which includes the oldest full map in the trim
735 * encode_trim_extra() does not however write the full map's
736 * version to 'full_latest'. This is only done when we are building the
737 * full maps from the incremental versions. But don't panic! We make sure
738 * that the following conditions find whichever full map version is newer.
740 version_t latest_full
= get_version_latest_full();
741 if (latest_full
== 0 && get_first_committed() > 1)
742 latest_full
= get_first_committed();
744 if (get_first_committed() > 1 &&
745 latest_full
< get_first_committed()) {
746 // the monitor could be just sync'ed with its peer, and the latest_full key
747 // is not encoded in the paxos commits in encode_pending(), so we need to
748 // make sure we get it pointing to a proper version.
749 version_t lc
= get_last_committed();
750 version_t fc
= get_first_committed();
752 dout(10) << __func__
<< " looking for valid full map in interval"
753 << " [" << fc
<< ", " << lc
<< "]" << dendl
;
756 for (version_t v
= lc
; v
>= fc
; v
--) {
757 string full_key
= "full_" + stringify(v
);
758 if (mon
.store
->exists(get_service_name(), full_key
)) {
759 dout(10) << __func__
<< " found latest full map v " << v
<< dendl
;
765 ceph_assert(latest_full
> 0);
766 auto t(std::make_shared
<MonitorDBStore::Transaction
>());
767 put_version_latest_full(t
, latest_full
);
768 mon
.store
->apply_transaction(t
);
769 dout(10) << __func__
<< " updated the on-disk full map version to "
770 << latest_full
<< dendl
;
773 if ((latest_full
> 0) && (latest_full
> osdmap
.epoch
)) {
774 bufferlist latest_bl
;
775 get_version_full(latest_full
, latest_bl
);
776 ceph_assert(latest_bl
.length() != 0);
777 dout(7) << __func__
<< " loading latest full map e" << latest_full
<< dendl
;
779 osdmap
.decode(latest_bl
);
783 if (!mon
.store
->get(OSD_PG_CREATING_PREFIX
, "creating", bl
)) {
784 auto p
= bl
.cbegin();
785 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
786 creating_pgs
.decode(p
);
787 dout(7) << __func__
<< " loading creating_pgs last_scan_epoch "
788 << creating_pgs
.last_scan_epoch
789 << " with " << creating_pgs
.pgs
.size() << " pgs" << dendl
;
791 dout(1) << __func__
<< " missing creating pgs; upgrade from post-kraken?"
795 // walk through incrementals
796 MonitorDBStore::TransactionRef t
;
798 while (version
> osdmap
.epoch
) {
800 int err
= get_version(osdmap
.epoch
+1, inc_bl
);
801 ceph_assert(err
== 0);
802 ceph_assert(inc_bl
.length());
803 // set priority cache manager levels if the osdmap is
804 // being populated for the first time.
805 if (mon_memory_autotune
&& pcm
== nullptr) {
806 int r
= register_cache_with_pcm();
809 << " Error while registering osdmon caches with pcm."
810 << " Proceeding without cache auto tuning."
815 dout(7) << "update_from_paxos applying incremental " << osdmap
.epoch
+1
817 OSDMap::Incremental
inc(inc_bl
);
818 err
= osdmap
.apply_incremental(inc
);
819 ceph_assert(err
== 0);
822 t
.reset(new MonitorDBStore::Transaction
);
824 // Write out the full map for all past epochs. Encode the full
825 // map with the same features as the incremental. If we don't
826 // know, use the quorum features. If we don't know those either,
827 // encode with all features.
828 uint64_t f
= inc
.encode_features
;
830 f
= mon
.get_quorum_con_features();
834 osdmap
.encode(full_bl
, f
| CEPH_FEATURE_RESERVED
);
835 tx_size
+= full_bl
.length();
837 bufferlist orig_full_bl
;
838 get_version_full(osdmap
.epoch
, orig_full_bl
);
839 if (orig_full_bl
.length()) {
840 // the primary provided the full map
841 ceph_assert(inc
.have_crc
);
842 if (inc
.full_crc
!= osdmap
.crc
) {
843 // This will happen if the mons were running mixed versions in
844 // the past or some other circumstance made the full encoded
845 // maps divergent. Reloading here will bring us back into
846 // sync with the primary for this and all future maps. OSDs
847 // will also be brought back into sync when they discover the
848 // crc mismatch and request a full map from a mon.
849 derr
<< __func__
<< " full map CRC mismatch, resetting to canonical"
852 dout(20) << __func__
<< " my (bad) full osdmap:\n";
853 JSONFormatter
jf(true);
854 jf
.dump_object("osdmap", osdmap
);
856 *_dout
<< "\nhexdump:\n";
857 full_bl
.hexdump(*_dout
);
861 osdmap
.decode(orig_full_bl
);
863 dout(20) << __func__
<< " canonical full osdmap:\n";
864 JSONFormatter
jf(true);
865 jf
.dump_object("osdmap", osdmap
);
867 *_dout
<< "\nhexdump:\n";
868 orig_full_bl
.hexdump(*_dout
);
872 ceph_assert(!inc
.have_crc
);
873 put_version_full(t
, osdmap
.epoch
, full_bl
);
875 put_version_latest_full(t
, osdmap
.epoch
);
878 dout(1) << osdmap
<< dendl
;
880 if (osdmap
.epoch
== 1) {
881 t
->erase("mkfs", "osdmap");
884 if (tx_size
> g_conf()->mon_sync_max_payload_size
*2) {
885 mon
.store
->apply_transaction(t
);
886 t
= MonitorDBStore::TransactionRef();
889 for (const auto [osd
, state
] : inc
.new_state
) {
890 if (state
& CEPH_OSD_UP
) {
891 // could be marked up *or* down, but we're too lazy to check which
892 last_osd_report
.erase(osd
);
894 if (state
& CEPH_OSD_OUT
) {
895 // could be marked in *or* out, but we can safely drop it
896 osd_epochs
.erase(osd
);
899 for (const auto [osd
, weight
] : inc
.new_weight
) {
900 if (weight
== CEPH_OSD_OUT
) {
901 // manually marked out, so drop it
902 osd_epochs
.erase(osd
);
908 mon
.store
->apply_transaction(t
);
911 bool marked_osd_down
= false;
912 for (int o
= 0; o
< osdmap
.get_max_osd(); o
++) {
913 if (osdmap
.is_out(o
))
915 auto found
= down_pending_out
.find(o
);
916 if (osdmap
.is_down(o
)) {
917 // populate down -> out map
918 if (found
== down_pending_out
.end()) {
919 dout(10) << " adding osd." << o
<< " to down_pending_out map" << dendl
;
920 down_pending_out
[o
] = ceph_clock_now();
921 marked_osd_down
= true;
924 if (found
!= down_pending_out
.end()) {
925 dout(10) << " removing osd." << o
<< " from down_pending_out map" << dendl
;
926 down_pending_out
.erase(found
);
930 // XXX: need to trim MonSession connected with a osd whose id > max_osd?
933 check_pg_creates_subs();
935 share_map_with_random_osd();
939 // make sure our feature bits reflect the latest map
940 update_msgr_features();
942 if (!mon
.is_leader()) {
943 // will be called by on_active() on the leader, avoid doing so twice
946 if (osdmap
.stretch_mode_enabled
) {
947 dout(20) << "Stretch mode enabled in this map" << dendl
;
948 mon
.maybe_engage_stretch_mode();
949 if (osdmap
.degraded_stretch_mode
) {
950 dout(20) << "Degraded stretch mode set in this map" << dendl
;
951 if (!osdmap
.recovering_stretch_mode
) {
952 mon
.set_degraded_stretch_mode();
953 if (prev_num_up_osd
< osdmap
.num_up_osd
&&
954 (osdmap
.num_up_osd
/ (double)osdmap
.num_osd
) >
955 cct
->_conf
.get_val
<double>("mon_stretch_cluster_recovery_ratio")) {
956 // TODO: This works for 2-site clusters when the OSD maps are appropriately
957 // trimmed and everything is "normal" but not if you have a lot of out OSDs
958 // you're ignoring or in some really degenerate failure cases
959 dout(10) << "Enabling recovery stretch mode in this map" << dendl
;
960 mon
.go_recovery_stretch_mode();
964 if (marked_osd_down
&&
965 (!osdmap
.degraded_stretch_mode
|| osdmap
.recovering_stretch_mode
)) {
966 dout(20) << "Checking degraded stretch mode due to osd changes" << dendl
;
967 mon
.maybe_go_degraded_stretch_mode();
969 if (osdmap
.recovering_stretch_mode
&& stretch_recovery_triggered
.is_zero()) {
970 stretch_recovery_triggered
= ceph_clock_now();
975 int OSDMonitor::register_cache_with_pcm()
977 if (mon_memory_target
<= 0 || mon_memory_min
<= 0) {
978 derr
<< __func__
<< " Invalid memory size specified for mon caches."
979 << " Caches will not be auto-tuned."
983 uint64_t base
= mon_memory_base
;
984 double fragmentation
= mon_memory_fragmentation
;
985 // For calculating total target memory, consider rocksdb cache size.
986 uint64_t target
= mon_memory_target
;
987 uint64_t min
= mon_memory_min
;
990 // Apply the same logic as in bluestore to set the max amount
991 // of memory to use for cache. Assume base memory for OSDMaps
992 // and then add in some overhead for fragmentation.
993 uint64_t ltarget
= (1.0 - fragmentation
) * target
;
994 if (ltarget
> base
+ min
) {
995 max
= ltarget
- base
;
998 rocksdb_binned_kv_cache
= mon
.store
->get_priority_cache();
999 if (!rocksdb_binned_kv_cache
) {
1000 derr
<< __func__
<< " not using rocksdb" << dendl
;
1004 int r
= _set_cache_ratios();
1006 derr
<< __func__
<< " Cache ratios for pcm could not be set."
1007 << " Review the kv (rocksdb) and mon_memory_target sizes."
1012 pcm
= std::make_shared
<PriorityCache::Manager
>(
1013 cct
, min
, max
, target
, true);
1014 pcm
->insert("kv", rocksdb_binned_kv_cache
, true);
1015 pcm
->insert("inc", inc_cache
, true);
1016 pcm
->insert("full", full_cache
, true);
1017 dout(1) << __func__
<< " pcm target: " << target
1018 << " pcm max: " << max
1019 << " pcm min: " << min
1020 << " inc_osd_cache size: " << inc_osd_cache
.get_size()
1025 int OSDMonitor::_set_cache_ratios()
1027 double old_cache_kv_ratio
= cache_kv_ratio
;
1029 // Set the cache ratios for kv(rocksdb), inc and full caches
1030 cache_kv_ratio
= (double)rocksdb_cache_size
/ (double)mon_memory_target
;
1031 if (cache_kv_ratio
>= 1.0) {
1032 derr
<< __func__
<< " Cache kv ratio (" << cache_kv_ratio
1033 << ") must be in range [0,<1.0]."
1035 cache_kv_ratio
= old_cache_kv_ratio
;
1038 rocksdb_binned_kv_cache
->set_cache_ratio(cache_kv_ratio
);
1039 cache_inc_ratio
= cache_full_ratio
= (1.0 - cache_kv_ratio
) / 2;
1040 inc_cache
->set_cache_ratio(cache_inc_ratio
);
1041 full_cache
->set_cache_ratio(cache_full_ratio
);
1043 dout(1) << __func__
<< " kv ratio " << cache_kv_ratio
1044 << " inc ratio " << cache_inc_ratio
1045 << " full ratio " << cache_full_ratio
1050 void OSDMonitor::start_mapping()
1052 // initiate mapping job
1054 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1056 mapping_job
->abort();
1058 if (!osdmap
.get_pools().empty()) {
1059 auto fin
= new C_UpdateCreatingPGs(this, osdmap
.get_epoch());
1060 mapping_job
= mapping
.start_update(osdmap
, mapper
,
1061 g_conf()->mon_osd_mapping_pgs_per_chunk
);
1062 dout(10) << __func__
<< " started mapping job " << mapping_job
.get()
1063 << " at " << fin
->start
<< dendl
;
1064 mapping_job
->set_finish_event(fin
);
1066 dout(10) << __func__
<< " no pools, no mapping job" << dendl
;
1067 mapping_job
= nullptr;
1071 void OSDMonitor::update_msgr_features()
1073 const int types
[] = {
1074 entity_name_t::TYPE_OSD
,
1075 entity_name_t::TYPE_CLIENT
,
1076 entity_name_t::TYPE_MDS
,
1077 entity_name_t::TYPE_MON
1079 for (int type
: types
) {
1081 uint64_t features
= osdmap
.get_features(type
, &mask
);
1082 if ((mon
.messenger
->get_policy(type
).features_required
& mask
) != features
) {
1083 dout(0) << "crush map has features " << features
<< ", adjusting msgr requires" << dendl
;
1084 ceph::net::Policy p
= mon
.messenger
->get_policy(type
);
1085 p
.features_required
= (p
.features_required
& ~mask
) | features
;
1086 mon
.messenger
->set_policy(type
, p
);
1091 void OSDMonitor::on_active()
1095 if (mon
.is_leader()) {
1096 mon
.clog
->debug() << "osdmap " << osdmap
;
1097 if (!priority_convert
) {
1098 // Only do this once at start-up
1099 convert_pool_priorities();
1100 priority_convert
= true;
1103 list
<MonOpRequestRef
> ls
;
1104 take_all_failures(ls
);
1105 while (!ls
.empty()) {
1106 MonOpRequestRef op
= ls
.front();
1107 op
->mark_osdmon_event(__func__
);
1115 void OSDMonitor::on_restart()
1117 last_osd_report
.clear();
1120 void OSDMonitor::on_shutdown()
1122 dout(10) << __func__
<< dendl
;
1124 dout(10) << __func__
<< " canceling previous mapping_job " << mapping_job
.get()
1126 mapping_job
->abort();
1129 // discard failure info, waiters
1130 list
<MonOpRequestRef
> ls
;
1131 take_all_failures(ls
);
1135 void OSDMonitor::update_logger()
1137 dout(10) << "update_logger" << dendl
;
1139 mon
.cluster_logger
->set(l_cluster_num_osd
, osdmap
.get_num_osds());
1140 mon
.cluster_logger
->set(l_cluster_num_osd_up
, osdmap
.get_num_up_osds());
1141 mon
.cluster_logger
->set(l_cluster_num_osd_in
, osdmap
.get_num_in_osds());
1142 mon
.cluster_logger
->set(l_cluster_osd_epoch
, osdmap
.get_epoch());
1145 void OSDMonitor::create_pending()
1147 pending_inc
= OSDMap::Incremental(osdmap
.epoch
+1);
1148 pending_inc
.fsid
= mon
.monmap
->fsid
;
1149 pending_metadata
.clear();
1150 pending_metadata_rm
.clear();
1151 pending_pseudo_purged_snaps
.clear();
1153 dout(10) << "create_pending e " << pending_inc
.epoch
<< dendl
;
1155 // safety checks (this shouldn't really happen)
1157 if (osdmap
.backfillfull_ratio
<= 0) {
1158 pending_inc
.new_backfillfull_ratio
= g_conf()->mon_osd_backfillfull_ratio
;
1159 if (pending_inc
.new_backfillfull_ratio
> 1.0)
1160 pending_inc
.new_backfillfull_ratio
/= 100;
1161 dout(1) << __func__
<< " setting backfillfull_ratio = "
1162 << pending_inc
.new_backfillfull_ratio
<< dendl
;
1164 if (osdmap
.full_ratio
<= 0) {
1165 pending_inc
.new_full_ratio
= g_conf()->mon_osd_full_ratio
;
1166 if (pending_inc
.new_full_ratio
> 1.0)
1167 pending_inc
.new_full_ratio
/= 100;
1168 dout(1) << __func__
<< " setting full_ratio = "
1169 << pending_inc
.new_full_ratio
<< dendl
;
1171 if (osdmap
.nearfull_ratio
<= 0) {
1172 pending_inc
.new_nearfull_ratio
= g_conf()->mon_osd_nearfull_ratio
;
1173 if (pending_inc
.new_nearfull_ratio
> 1.0)
1174 pending_inc
.new_nearfull_ratio
/= 100;
1175 dout(1) << __func__
<< " setting nearfull_ratio = "
1176 << pending_inc
.new_nearfull_ratio
<< dendl
;
1180 // Rewrite CRUSH rule IDs if they are using legacy "ruleset"
1182 if (osdmap
.crush
->has_legacy_rule_ids()) {
1183 CrushWrapper newcrush
;
1184 _get_pending_crush(newcrush
);
1186 // First, for all pools, work out which rule they really used
1187 // by resolving ruleset to rule.
1188 for (const auto &i
: osdmap
.get_pools()) {
1189 const auto pool_id
= i
.first
;
1190 const auto &pool
= i
.second
;
1191 int new_rule_id
= newcrush
.find_rule(pool
.crush_rule
,
1192 pool
.type
, pool
.size
);
1194 dout(1) << __func__
<< " rewriting pool "
1195 << osdmap
.get_pool_name(pool_id
) << " crush ruleset "
1196 << pool
.crush_rule
<< " -> rule id " << new_rule_id
<< dendl
;
1197 if (pending_inc
.new_pools
.count(pool_id
) == 0) {
1198 pending_inc
.new_pools
[pool_id
] = pool
;
1200 pending_inc
.new_pools
[pool_id
].crush_rule
= new_rule_id
;
1203 // Now, go ahead and renumber all the rules so that their
1204 // rule_id field corresponds to their position in the array
1205 auto old_to_new
= newcrush
.renumber_rules();
1206 dout(1) << __func__
<< " Rewrote " << old_to_new
<< " crush IDs:" << dendl
;
1207 for (const auto &i
: old_to_new
) {
1208 dout(1) << __func__
<< " " << i
.first
<< " -> " << i
.second
<< dendl
;
1210 pending_inc
.crush
.clear();
1211 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
1216 OSDMonitor::update_pending_pgs(const OSDMap::Incremental
& inc
,
1217 const OSDMap
& nextmap
)
1219 dout(10) << __func__
<< dendl
;
1220 creating_pgs_t pending_creatings
;
1222 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
1223 pending_creatings
= creating_pgs
;
1225 // check for new or old pools
1226 if (pending_creatings
.last_scan_epoch
< inc
.epoch
) {
1227 unsigned queued
= 0;
1228 queued
+= scan_for_creating_pgs(osdmap
.get_pools(),
1231 &pending_creatings
);
1232 queued
+= scan_for_creating_pgs(inc
.new_pools
,
1235 &pending_creatings
);
1236 dout(10) << __func__
<< " " << queued
<< " pools queued" << dendl
;
1237 for (auto deleted_pool
: inc
.old_pools
) {
1238 auto removed
= pending_creatings
.remove_pool(deleted_pool
);
1239 dout(10) << __func__
<< " " << removed
1240 << " pg removed because containing pool deleted: "
1241 << deleted_pool
<< dendl
;
1242 last_epoch_clean
.remove_pool(deleted_pool
);
1244 // pgmon updates its creating_pgs in check_osd_map() which is called by
1245 // on_active() and check_osd_map() could be delayed if lease expires, so its
1246 // creating_pgs could be stale in comparison with the one of osdmon. let's
1247 // trim them here. otherwise, they will be added back after being erased.
1248 unsigned removed
= 0;
1249 for (auto& pg
: pending_created_pgs
) {
1250 dout(20) << __func__
<< " noting created pg " << pg
<< dendl
;
1251 pending_creatings
.created_pools
.insert(pg
.pool());
1252 removed
+= pending_creatings
.pgs
.erase(pg
);
1254 pending_created_pgs
.clear();
1255 dout(10) << __func__
<< " " << removed
1256 << " pgs removed because they're created" << dendl
;
1257 pending_creatings
.last_scan_epoch
= osdmap
.get_epoch();
1260 // filter out any pgs that shouldn't exist.
1262 auto i
= pending_creatings
.pgs
.begin();
1263 while (i
!= pending_creatings
.pgs
.end()) {
1264 if (!nextmap
.pg_exists(i
->first
)) {
1265 dout(10) << __func__
<< " removing pg " << i
->first
1266 << " which should not exist" << dendl
;
1267 i
= pending_creatings
.pgs
.erase(i
);
1275 unsigned max
= std::max
<int64_t>(1, g_conf()->mon_osd_max_creating_pgs
);
1276 const auto total
= pending_creatings
.pgs
.size();
1277 while (pending_creatings
.pgs
.size() < max
&&
1278 !pending_creatings
.queue
.empty()) {
1279 auto p
= pending_creatings
.queue
.begin();
1280 int64_t poolid
= p
->first
;
1281 dout(10) << __func__
<< " pool " << poolid
1282 << " created " << p
->second
.created
1283 << " modified " << p
->second
.modified
1284 << " [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1286 int64_t n
= std::min
<int64_t>(max
- pending_creatings
.pgs
.size(),
1287 p
->second
.end
- p
->second
.start
);
1288 ps_t first
= p
->second
.start
;
1289 ps_t end
= first
+ n
;
1290 for (ps_t ps
= first
; ps
< end
; ++ps
) {
1291 const pg_t pgid
{ps
, static_cast<uint64_t>(poolid
)};
1292 // NOTE: use the *current* epoch as the PG creation epoch so that the
1293 // OSD does not have to generate a long set of PastIntervals.
1294 pending_creatings
.pgs
.emplace(
1296 creating_pgs_t::pg_create_info(inc
.epoch
,
1297 p
->second
.modified
));
1298 dout(10) << __func__
<< " adding " << pgid
<< dendl
;
1300 p
->second
.start
= end
;
1301 if (p
->second
.done()) {
1302 dout(10) << __func__
<< " done with queue for " << poolid
<< dendl
;
1303 pending_creatings
.queue
.erase(p
);
1305 dout(10) << __func__
<< " pool " << poolid
1306 << " now [" << p
->second
.start
<< "-" << p
->second
.end
<< ")"
1310 dout(10) << __func__
<< " queue remaining: " << pending_creatings
.queue
.size()
1311 << " pools" << dendl
;
1313 if (mon
.monmap
->min_mon_release
>= ceph_release_t::octopus
) {
1314 // walk creating pgs' history and past_intervals forward
1315 for (auto& i
: pending_creatings
.pgs
) {
1316 // this mirrors PG::start_peering_interval()
1317 pg_t pgid
= i
.first
;
1319 // this is a bit imprecise, but sufficient?
1320 struct min_size_predicate_t
: public IsPGRecoverablePredicate
{
1321 const pg_pool_t
*pi
;
1322 bool operator()(const set
<pg_shard_t
> &have
) const {
1323 return have
.size() >= pi
->min_size
;
1325 explicit min_size_predicate_t(const pg_pool_t
*i
) : pi(i
) {}
1326 } min_size_predicate(nextmap
.get_pg_pool(pgid
.pool()));
1328 vector
<int> up
, acting
;
1329 int up_primary
, acting_primary
;
1330 nextmap
.pg_to_up_acting_osds(
1331 pgid
, &up
, &up_primary
, &acting
, &acting_primary
);
1332 if (i
.second
.history
.epoch_created
== 0) {
1333 // new pg entry, set it up
1335 i
.second
.acting
= acting
;
1336 i
.second
.up_primary
= up_primary
;
1337 i
.second
.acting_primary
= acting_primary
;
1338 i
.second
.history
= pg_history_t(i
.second
.create_epoch
,
1339 i
.second
.create_stamp
);
1340 dout(10) << __func__
<< " pg " << pgid
<< " just added, "
1341 << " up " << i
.second
.up
1342 << " p " << i
.second
.up_primary
1343 << " acting " << i
.second
.acting
1344 << " p " << i
.second
.acting_primary
1345 << " history " << i
.second
.history
1346 << " past_intervals " << i
.second
.past_intervals
1349 std::stringstream debug
;
1350 if (PastIntervals::check_new_interval(
1351 i
.second
.acting_primary
, acting_primary
,
1352 i
.second
.acting
, acting
,
1353 i
.second
.up_primary
, up_primary
,
1355 i
.second
.history
.same_interval_since
,
1356 i
.second
.history
.last_epoch_clean
,
1361 &i
.second
.past_intervals
,
1363 epoch_t e
= inc
.epoch
;
1364 i
.second
.history
.same_interval_since
= e
;
1365 if (i
.second
.up
!= up
) {
1366 i
.second
.history
.same_up_since
= e
;
1368 if (i
.second
.acting_primary
!= acting_primary
) {
1369 i
.second
.history
.same_primary_since
= e
;
1372 osdmap
.get_pg_num(pgid
.pool()),
1373 nextmap
.get_pg_num(pgid
.pool()),
1375 i
.second
.history
.last_epoch_split
= e
;
1377 dout(10) << __func__
<< " pg " << pgid
<< " new interval,"
1378 << " up " << i
.second
.up
<< " -> " << up
1379 << " p " << i
.second
.up_primary
<< " -> " << up_primary
1380 << " acting " << i
.second
.acting
<< " -> " << acting
1381 << " p " << i
.second
.acting_primary
<< " -> "
1383 << " history " << i
.second
.history
1384 << " past_intervals " << i
.second
.past_intervals
1386 dout(20) << " debug: " << debug
.str() << dendl
;
1388 i
.second
.acting
= acting
;
1389 i
.second
.up_primary
= up_primary
;
1390 i
.second
.acting_primary
= acting_primary
;
1395 dout(10) << __func__
1396 << " " << (pending_creatings
.pgs
.size() - total
)
1397 << "/" << pending_creatings
.pgs
.size()
1398 << " pgs added from queued pools" << dendl
;
1399 return pending_creatings
;
1402 void OSDMonitor::maybe_prime_pg_temp()
1405 if (pending_inc
.crush
.length()) {
1406 dout(10) << __func__
<< " new crush map, all" << dendl
;
1410 if (!pending_inc
.new_up_client
.empty()) {
1411 dout(10) << __func__
<< " new up osds, all" << dendl
;
1415 // check for interesting OSDs
1417 for (auto p
= pending_inc
.new_state
.begin();
1418 !all
&& p
!= pending_inc
.new_state
.end();
1420 if ((p
->second
& CEPH_OSD_UP
) &&
1421 osdmap
.is_up(p
->first
)) {
1422 osds
.insert(p
->first
);
1425 for (auto p
= pending_inc
.new_weight
.begin();
1426 !all
&& p
!= pending_inc
.new_weight
.end();
1428 if (osdmap
.exists(p
->first
) && p
->second
< osdmap
.get_weight(p
->first
)) {
1430 osds
.insert(p
->first
);
1432 dout(10) << __func__
<< " osd." << p
->first
<< " weight increase, all"
1438 if (!all
&& osds
.empty())
1443 mapping
.get_osd_acting_pgs(*osds
.begin()).size() * osds
.size();
1444 if (estimate
> mapping
.get_num_pgs() *
1445 g_conf()->mon_osd_prime_pg_temp_max_estimate
) {
1446 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1447 << osds
.size() << " osds >= "
1448 << g_conf()->mon_osd_prime_pg_temp_max_estimate
<< " of total "
1449 << mapping
.get_num_pgs() << " pgs, all"
1453 dout(10) << __func__
<< " estimate " << estimate
<< " pgs on "
1454 << osds
.size() << " osds" << dendl
;
1459 next
.deepish_copy_from(osdmap
);
1460 next
.apply_incremental(pending_inc
);
1462 if (next
.get_pools().empty()) {
1463 dout(10) << __func__
<< " no pools, no pg_temp priming" << dendl
;
1465 PrimeTempJob
job(next
, this);
1466 mapper
.queue(&job
, g_conf()->mon_osd_mapping_pgs_per_chunk
, {});
1467 if (job
.wait_for(g_conf()->mon_osd_prime_pg_temp_max_time
)) {
1468 dout(10) << __func__
<< " done in " << job
.get_duration() << dendl
;
1470 dout(10) << __func__
<< " did not finish in "
1471 << g_conf()->mon_osd_prime_pg_temp_max_time
1472 << ", stopping" << dendl
;
1476 dout(10) << __func__
<< " " << osds
.size() << " interesting osds" << dendl
;
1477 utime_t stop
= ceph_clock_now();
1478 stop
+= g_conf()->mon_osd_prime_pg_temp_max_time
;
1479 const int chunk
= 1000;
1481 std::unordered_set
<pg_t
> did_pgs
;
1482 for (auto osd
: osds
) {
1483 auto& pgs
= mapping
.get_osd_acting_pgs(osd
);
1484 dout(20) << __func__
<< " osd." << osd
<< " " << pgs
<< dendl
;
1485 for (auto pgid
: pgs
) {
1486 if (!did_pgs
.insert(pgid
).second
) {
1489 prime_pg_temp(next
, pgid
);
1492 if (ceph_clock_now() > stop
) {
1493 dout(10) << __func__
<< " consumed more than "
1494 << g_conf()->mon_osd_prime_pg_temp_max_time
1495 << " seconds, stopping"
1505 void OSDMonitor::prime_pg_temp(
1509 // TODO: remove this creating_pgs direct access?
1510 if (creating_pgs
.pgs
.count(pgid
)) {
1513 if (!osdmap
.pg_exists(pgid
)) {
1517 vector
<int> up
, acting
;
1518 mapping
.get(pgid
, &up
, nullptr, &acting
, nullptr);
1520 vector
<int> next_up
, next_acting
;
1521 int next_up_primary
, next_acting_primary
;
1522 next
.pg_to_up_acting_osds(pgid
, &next_up
, &next_up_primary
,
1523 &next_acting
, &next_acting_primary
);
1524 if (acting
== next_acting
&&
1525 !(up
!= acting
&& next_up
== next_acting
))
1526 return; // no change since last epoch
1529 return; // if previously empty now we can be no worse off
1530 const pg_pool_t
*pool
= next
.get_pg_pool(pgid
.pool());
1531 if (pool
&& acting
.size() < pool
->min_size
)
1532 return; // can be no worse off than before
1534 if (next_up
== next_acting
) {
1536 dout(20) << __func__
<< " next_up == next_acting now, clear pg_temp"
1540 dout(20) << __func__
<< " " << pgid
<< " " << up
<< "/" << acting
1541 << " -> " << next_up
<< "/" << next_acting
1542 << ", priming " << acting
1545 std::lock_guard
l(prime_pg_temp_lock
);
1546 // do not touch a mapping if a change is pending
1547 pending_inc
.new_pg_temp
.emplace(
1549 mempool::osdmap::vector
<int>(acting
.begin(), acting
.end()));
1554 * @note receiving a transaction in this function gives a fair amount of
1555 * freedom to the service implementation if it does need it. It shouldn't.
1557 void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t
)
1559 dout(10) << "encode_pending e " << pending_inc
.epoch
1563 dout(1) << __func__
<< " osdmap full prune encoded e"
1564 << pending_inc
.epoch
<< dendl
;
1567 // finalize up pending_inc
1568 pending_inc
.modified
= ceph_clock_now();
1570 int r
= pending_inc
.propagate_base_properties_to_tiers(cct
, osdmap
);
1571 ceph_assert(r
== 0);
1574 if (!mapping_job
->is_done()) {
1575 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1576 << mapping_job
.get() << " did not complete, "
1577 << mapping_job
->shards
<< " left" << dendl
;
1578 mapping_job
->abort();
1579 } else if (mapping
.get_epoch() < osdmap
.get_epoch()) {
1580 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job "
1581 << mapping_job
.get() << " is prior epoch "
1582 << mapping
.get_epoch() << dendl
;
1584 if (g_conf()->mon_osd_prime_pg_temp
) {
1585 maybe_prime_pg_temp();
1588 } else if (g_conf()->mon_osd_prime_pg_temp
) {
1589 dout(1) << __func__
<< " skipping prime_pg_temp; mapping job did not start"
1592 mapping_job
.reset();
1594 // ensure we don't have blank new_state updates. these are interrpeted as
1595 // CEPH_OSD_UP (and almost certainly not what we want!).
1596 auto p
= pending_inc
.new_state
.begin();
1597 while (p
!= pending_inc
.new_state
.end()) {
1598 if (p
->second
== 0) {
1599 dout(10) << "new_state for osd." << p
->first
<< " is 0, removing" << dendl
;
1600 p
= pending_inc
.new_state
.erase(p
);
1602 if (p
->second
& CEPH_OSD_UP
) {
1603 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1608 if (!pending_inc
.new_up_client
.empty()) {
1609 pending_inc
.new_last_up_change
= pending_inc
.modified
;
1611 for (auto& i
: pending_inc
.new_weight
) {
1612 if (i
.first
>= osdmap
.max_osd
) {
1614 // new osd is already marked in
1615 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1618 } else if (!!i
.second
!= !!osdmap
.osd_weight
[i
.first
]) {
1619 // existing osd marked in or out
1620 pending_inc
.new_last_in_change
= pending_inc
.modified
;
1627 tmp
.deepish_copy_from(osdmap
);
1628 tmp
.apply_incremental(pending_inc
);
1630 // clean pg_temp mappings
1631 OSDMap::clean_temps(cct
, osdmap
, tmp
, &pending_inc
);
1633 // clean inappropriate pg_upmap/pg_upmap_items (if any)
1635 // check every upmapped pg for now
1636 // until we could reliably identify certain cases to ignore,
1637 // which is obviously the hard part TBD..
1638 vector
<pg_t
> pgs_to_check
;
1639 tmp
.get_upmap_pgs(&pgs_to_check
);
1640 if (pgs_to_check
.size() <
1641 static_cast<uint64_t>(g_conf()->mon_clean_pg_upmaps_per_chunk
* 2)) {
1642 // not enough pgs, do it inline
1643 tmp
.clean_pg_upmaps(cct
, &pending_inc
);
1645 CleanUpmapJob
job(cct
, tmp
, pending_inc
);
1646 mapper
.queue(&job
, g_conf()->mon_clean_pg_upmaps_per_chunk
, pgs_to_check
);
1651 // update creating pgs first so that we can remove the created pgid and
1652 // process the pool flag removal below in the same osdmap epoch.
1653 auto pending_creatings
= update_pending_pgs(pending_inc
, tmp
);
1654 bufferlist creatings_bl
;
1655 uint64_t features
= CEPH_FEATURES_ALL
;
1656 if (mon
.monmap
->min_mon_release
< ceph_release_t::octopus
) {
1657 dout(20) << __func__
<< " encoding pending pgs without octopus features"
1659 features
&= ~CEPH_FEATURE_SERVER_OCTOPUS
;
1661 encode(pending_creatings
, creatings_bl
, features
);
1662 t
->put(OSD_PG_CREATING_PREFIX
, "creating", creatings_bl
);
1664 // remove any old (or incompat) POOL_CREATING flags
1665 for (auto& i
: tmp
.get_pools()) {
1666 if (tmp
.require_osd_release
< ceph_release_t::nautilus
) {
1667 // pre-nautilus OSDMaps shouldn't get this flag.
1668 if (pending_inc
.new_pools
.count(i
.first
)) {
1669 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1672 if (i
.second
.has_flag(pg_pool_t::FLAG_CREATING
) &&
1673 !pending_creatings
.still_creating_pool(i
.first
)) {
1674 dout(10) << __func__
<< " done creating pool " << i
.first
1675 << ", clearing CREATING flag" << dendl
;
1676 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1677 pending_inc
.new_pools
[i
.first
] = i
.second
;
1679 pending_inc
.new_pools
[i
.first
].flags
&= ~pg_pool_t::FLAG_CREATING
;
1683 // collect which pools are currently affected by
1684 // the near/backfill/full osd(s),
1685 // and set per-pool near/backfill/full flag instead
1686 set
<int64_t> full_pool_ids
;
1687 set
<int64_t> backfillfull_pool_ids
;
1688 set
<int64_t> nearfull_pool_ids
;
1689 tmp
.get_full_pools(cct
,
1691 &backfillfull_pool_ids
,
1692 &nearfull_pool_ids
);
1693 if (full_pool_ids
.empty() ||
1694 backfillfull_pool_ids
.empty() ||
1695 nearfull_pool_ids
.empty()) {
1696 // normal case - no nearfull, backfillfull or full osds
1697 // try cancel any improper nearfull/backfillfull/full pool
1699 for (auto &pool
: tmp
.get_pools()) {
1700 auto p
= pool
.first
;
1701 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
) &&
1702 nearfull_pool_ids
.empty()) {
1703 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1704 << "'s nearfull flag" << dendl
;
1705 if (pending_inc
.new_pools
.count(p
) == 0) {
1706 // load original pool info first!
1707 pending_inc
.new_pools
[p
] = pool
.second
;
1709 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1711 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
) &&
1712 backfillfull_pool_ids
.empty()) {
1713 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1714 << "'s backfillfull flag" << dendl
;
1715 if (pending_inc
.new_pools
.count(p
) == 0) {
1716 pending_inc
.new_pools
[p
] = pool
.second
;
1718 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1720 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) &&
1721 full_pool_ids
.empty()) {
1722 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1723 // set by EQUOTA, skipping
1726 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1727 << "'s full flag" << dendl
;
1728 if (pending_inc
.new_pools
.count(p
) == 0) {
1729 pending_inc
.new_pools
[p
] = pool
.second
;
1731 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1735 if (!full_pool_ids
.empty()) {
1736 dout(10) << __func__
<< " marking pool(s) " << full_pool_ids
1737 << " as full" << dendl
;
1738 for (auto &p
: full_pool_ids
) {
1739 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
)) {
1742 if (pending_inc
.new_pools
.count(p
) == 0) {
1743 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1745 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_FULL
;
1746 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1747 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1749 // cancel FLAG_FULL for pools which are no longer full too
1750 for (auto &pool
: tmp
.get_pools()) {
1751 auto p
= pool
.first
;
1752 if (full_pool_ids
.count(p
)) {
1753 // skip pools we have just marked as full above
1756 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
) ||
1757 tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1758 // don't touch if currently is not full
1759 // or is running out of quota (and hence considered as full)
1762 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1763 << "'s full flag" << dendl
;
1764 if (pending_inc
.new_pools
.count(p
) == 0) {
1765 pending_inc
.new_pools
[p
] = pool
.second
;
1767 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_FULL
;
1770 if (!backfillfull_pool_ids
.empty()) {
1771 for (auto &p
: backfillfull_pool_ids
) {
1772 if (full_pool_ids
.count(p
)) {
1773 // skip pools we have already considered as full above
1776 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1777 // make sure FLAG_FULL is truly set, so we are safe not
1778 // to set a extra (redundant) FLAG_BACKFILLFULL flag
1779 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1782 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1783 // don't bother if pool is already marked as backfillfull
1786 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1787 << "'s as backfillfull" << dendl
;
1788 if (pending_inc
.new_pools
.count(p
) == 0) {
1789 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1791 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_BACKFILLFULL
;
1792 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1794 // cancel FLAG_BACKFILLFULL for pools
1795 // which are no longer backfillfull too
1796 for (auto &pool
: tmp
.get_pools()) {
1797 auto p
= pool
.first
;
1798 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1799 // skip pools we have just marked as backfillfull/full above
1802 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_BACKFILLFULL
)) {
1803 // and don't touch if currently is not backfillfull
1806 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1807 << "'s backfillfull flag" << dendl
;
1808 if (pending_inc
.new_pools
.count(p
) == 0) {
1809 pending_inc
.new_pools
[p
] = pool
.second
;
1811 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_BACKFILLFULL
;
1814 if (!nearfull_pool_ids
.empty()) {
1815 for (auto &p
: nearfull_pool_ids
) {
1816 if (full_pool_ids
.count(p
) || backfillfull_pool_ids
.count(p
)) {
1819 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
1820 // make sure FLAG_FULL is truly set, so we are safe not
1821 // to set a extra (redundant) FLAG_NEARFULL flag
1822 ceph_assert(tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_FULL
));
1825 if (tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1826 // don't bother if pool is already marked as nearfull
1829 dout(10) << __func__
<< " marking pool '" << tmp
.pool_name
[p
]
1830 << "'s as nearfull" << dendl
;
1831 if (pending_inc
.new_pools
.count(p
) == 0) {
1832 pending_inc
.new_pools
[p
] = tmp
.pools
[p
];
1834 pending_inc
.new_pools
[p
].flags
|= pg_pool_t::FLAG_NEARFULL
;
1836 // cancel FLAG_NEARFULL for pools
1837 // which are no longer nearfull too
1838 for (auto &pool
: tmp
.get_pools()) {
1839 auto p
= pool
.first
;
1840 if (full_pool_ids
.count(p
) ||
1841 backfillfull_pool_ids
.count(p
) ||
1842 nearfull_pool_ids
.count(p
)) {
1843 // skip pools we have just marked as
1844 // nearfull/backfillfull/full above
1847 if (!tmp
.get_pg_pool(p
)->has_flag(pg_pool_t::FLAG_NEARFULL
)) {
1848 // and don't touch if currently is not nearfull
1851 dout(10) << __func__
<< " clearing pool '" << tmp
.pool_name
[p
]
1852 << "'s nearfull flag" << dendl
;
1853 if (pending_inc
.new_pools
.count(p
) == 0) {
1854 pending_inc
.new_pools
[p
] = pool
.second
;
1856 pending_inc
.new_pools
[p
].flags
&= ~pg_pool_t::FLAG_NEARFULL
;
1860 // min_compat_client?
1861 if (!tmp
.require_min_compat_client
) {
1862 auto mv
= tmp
.get_min_compat_client();
1863 dout(1) << __func__
<< " setting require_min_compat_client to currently "
1864 << "required " << mv
<< dendl
;
1865 mon
.clog
->info() << "setting require_min_compat_client to currently "
1866 << "required " << mv
;
1867 pending_inc
.new_require_min_compat_client
= mv
;
1870 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
&&
1871 tmp
.require_osd_release
>= ceph_release_t::nautilus
) {
1872 dout(10) << __func__
<< " first nautilus+ epoch" << dendl
;
1873 // add creating flags?
1874 for (auto& i
: tmp
.get_pools()) {
1875 if (pending_creatings
.still_creating_pool(i
.first
)) {
1876 dout(10) << __func__
<< " adding CREATING flag to pool " << i
.first
1878 if (pending_inc
.new_pools
.count(i
.first
) == 0) {
1879 pending_inc
.new_pools
[i
.first
] = i
.second
;
1881 pending_inc
.new_pools
[i
.first
].flags
|= pg_pool_t::FLAG_CREATING
;
1884 // adjust blocklist items to all be TYPE_ANY
1885 for (auto& i
: tmp
.blocklist
) {
1887 a
.set_type(entity_addr_t::TYPE_ANY
);
1888 pending_inc
.new_blocklist
[a
] = i
.second
;
1889 pending_inc
.old_blocklist
.push_back(i
.first
);
1893 if (osdmap
.require_osd_release
< ceph_release_t::octopus
&&
1894 tmp
.require_osd_release
>= ceph_release_t::octopus
) {
1895 dout(10) << __func__
<< " first octopus+ epoch" << dendl
;
1897 // adjust obsoleted cache modes
1898 for (auto& [poolid
, pi
] : tmp
.pools
) {
1899 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_FORWARD
) {
1900 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1901 pending_inc
.new_pools
[poolid
] = pi
;
1903 dout(10) << __func__
<< " switching pool " << poolid
1904 << " cachemode from forward -> proxy" << dendl
;
1905 pending_inc
.new_pools
[poolid
].cache_mode
= pg_pool_t::CACHEMODE_PROXY
;
1907 if (pi
.cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
1908 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1909 pending_inc
.new_pools
[poolid
] = pi
;
1911 dout(10) << __func__
<< " switching pool " << poolid
1912 << " cachemode from readforward -> readproxy" << dendl
;
1913 pending_inc
.new_pools
[poolid
].cache_mode
=
1914 pg_pool_t::CACHEMODE_READPROXY
;
1918 // clear removed_snaps for every pool
1919 for (auto& [poolid
, pi
] : tmp
.pools
) {
1920 if (pi
.removed_snaps
.empty()) {
1923 if (pending_inc
.new_pools
.count(poolid
) == 0) {
1924 pending_inc
.new_pools
[poolid
] = pi
;
1926 dout(10) << __func__
<< " clearing pool " << poolid
<< " removed_snaps"
1928 pending_inc
.new_pools
[poolid
].removed_snaps
.clear();
1931 // create a combined purged snap epoch key for all purged snaps
1932 // prior to this epoch, and store it in the current epoch (i.e.,
1933 // the last pre-octopus epoch, just prior to the one we're
1935 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
1936 it
->lower_bound("purged_snap_");
1937 map
<int64_t,snap_interval_set_t
> combined
;
1938 while (it
->valid()) {
1939 if (it
->key().find("purged_snap_") != 0) {
1942 string k
= it
->key();
1943 long long unsigned pool
;
1944 int n
= sscanf(k
.c_str(), "purged_snap_%llu_", &pool
);
1946 derr
<< __func__
<< " invalid purged_snaps key '" << k
<< "'" << dendl
;
1948 bufferlist v
= it
->value();
1949 auto p
= v
.cbegin();
1950 snapid_t begin
, end
;
1951 ceph::decode(begin
, p
);
1952 ceph::decode(end
, p
);
1953 combined
[pool
].insert(begin
, end
- begin
);
1957 if (!combined
.empty()) {
1958 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
- 1);
1960 ceph::encode(combined
, v
);
1961 t
->put(OSD_SNAP_PREFIX
, k
, v
);
1962 dout(10) << __func__
<< " recording pre-octopus purged_snaps in epoch "
1963 << (pending_inc
.epoch
- 1) << ", " << v
.length() << " bytes"
1966 dout(10) << __func__
<< " there were no pre-octopus purged snaps"
1970 // clean out the old removed_snap_ and removed_epoch keys
1971 // ('`' is ASCII '_' + 1)
1972 t
->erase_range(OSD_SNAP_PREFIX
, "removed_snap_", "removed_snap`");
1973 t
->erase_range(OSD_SNAP_PREFIX
, "removed_epoch_", "removed_epoch`");
1978 for (auto i
= pending_inc
.new_state
.begin();
1979 i
!= pending_inc
.new_state
.end();
1981 int s
= i
->second
? i
->second
: CEPH_OSD_UP
;
1982 if (s
& CEPH_OSD_UP
) {
1983 dout(2) << " osd." << i
->first
<< " DOWN" << dendl
;
1984 // Reset laggy parameters if failure interval exceeds a threshold.
1985 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(i
->first
);
1986 if ((xi
.laggy_probability
|| xi
.laggy_interval
) && xi
.down_stamp
.sec()) {
1987 int last_failure_interval
= pending_inc
.modified
.sec() - xi
.down_stamp
.sec();
1988 if (grace_interval_threshold_exceeded(last_failure_interval
)) {
1989 set_default_laggy_params(i
->first
);
1993 if (s
& CEPH_OSD_EXISTS
)
1994 dout(2) << " osd." << i
->first
<< " DNE" << dendl
;
1996 for (auto i
= pending_inc
.new_up_client
.begin();
1997 i
!= pending_inc
.new_up_client
.end();
1999 //FIXME: insert cluster addresses too
2000 dout(2) << " osd." << i
->first
<< " UP " << i
->second
<< dendl
;
2002 for (map
<int32_t,uint32_t>::iterator i
= pending_inc
.new_weight
.begin();
2003 i
!= pending_inc
.new_weight
.end();
2005 if (i
->second
== CEPH_OSD_OUT
) {
2006 dout(2) << " osd." << i
->first
<< " OUT" << dendl
;
2007 } else if (i
->second
== CEPH_OSD_IN
) {
2008 dout(2) << " osd." << i
->first
<< " IN" << dendl
;
2010 dout(2) << " osd." << i
->first
<< " WEIGHT " << hex
<< i
->second
<< dec
<< dendl
;
2014 // features for osdmap and its incremental
2017 // encode full map and determine its crc
2020 tmp
.deepish_copy_from(osdmap
);
2021 tmp
.apply_incremental(pending_inc
);
2023 // determine appropriate features
2024 features
= tmp
.get_encoding_features();
2025 dout(10) << __func__
<< " encoding full map with "
2026 << tmp
.require_osd_release
2027 << " features " << features
<< dendl
;
2029 // the features should be a subset of the mon quorum's features!
2030 ceph_assert((features
& ~mon
.get_quorum_con_features()) == 0);
2033 encode(tmp
, fullbl
, features
| CEPH_FEATURE_RESERVED
);
2034 pending_inc
.full_crc
= tmp
.get_crc();
2036 // include full map in the txn. note that old monitors will
2037 // overwrite this. new ones will now skip the local full map
2038 // encode and reload from this.
2039 put_version_full(t
, pending_inc
.epoch
, fullbl
);
2043 ceph_assert(get_last_committed() + 1 == pending_inc
.epoch
);
2045 encode(pending_inc
, bl
, features
| CEPH_FEATURE_RESERVED
);
2047 dout(20) << " full_crc " << tmp
.get_crc()
2048 << " inc_crc " << pending_inc
.inc_crc
<< dendl
;
2050 /* put everything in the transaction */
2051 put_version(t
, pending_inc
.epoch
, bl
);
2052 put_last_committed(t
, pending_inc
.epoch
);
2055 for (map
<int,bufferlist
>::iterator p
= pending_metadata
.begin();
2056 p
!= pending_metadata
.end();
2058 t
->put(OSD_METADATA_PREFIX
, stringify(p
->first
), p
->second
);
2059 for (set
<int>::iterator p
= pending_metadata_rm
.begin();
2060 p
!= pending_metadata_rm
.end();
2062 t
->erase(OSD_METADATA_PREFIX
, stringify(*p
));
2063 pending_metadata
.clear();
2064 pending_metadata_rm
.clear();
2067 if (tmp
.require_osd_release
>= ceph_release_t::octopus
&&
2068 !pending_inc
.new_purged_snaps
.empty()) {
2069 // all snaps purged this epoch (across all pools)
2070 string k
= make_purged_snap_epoch_key(pending_inc
.epoch
);
2072 encode(pending_inc
.new_purged_snaps
, v
);
2073 t
->put(OSD_SNAP_PREFIX
, k
, v
);
2075 for (auto& i
: pending_inc
.new_purged_snaps
) {
2076 for (auto q
= i
.second
.begin();
2077 q
!= i
.second
.end();
2079 insert_purged_snap_update(i
.first
, q
.get_start(), q
.get_end(),
2084 for (auto& [pool
, snaps
] : pending_pseudo_purged_snaps
) {
2085 for (auto snap
: snaps
) {
2086 insert_purged_snap_update(pool
, snap
, snap
+ 1,
2093 health_check_map_t next
;
2094 tmp
.check_health(cct
, &next
);
2095 encode_health(next
, t
);
2098 int OSDMonitor::load_metadata(int osd
, map
<string
, string
>& m
, ostream
*err
)
2101 int r
= mon
.store
->get(OSD_METADATA_PREFIX
, stringify(osd
), bl
);
2105 auto p
= bl
.cbegin();
2108 catch (ceph::buffer::error
& e
) {
2110 *err
<< "osd." << osd
<< " metadata is corrupt";
2116 void OSDMonitor::count_metadata(const string
& field
, map
<string
,int> *out
)
2118 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2119 if (osdmap
.is_up(osd
)) {
2120 map
<string
,string
> meta
;
2121 load_metadata(osd
, meta
, nullptr);
2122 auto p
= meta
.find(field
);
2123 if (p
== meta
.end()) {
2124 (*out
)["unknown"]++;
2126 (*out
)[p
->second
]++;
2132 void OSDMonitor::count_metadata(const string
& field
, Formatter
*f
)
2134 map
<string
,int> by_val
;
2135 count_metadata(field
, &by_val
);
2136 f
->open_object_section(field
.c_str());
2137 for (auto& p
: by_val
) {
2138 f
->dump_int(p
.first
.c_str(), p
.second
);
2143 void OSDMonitor::get_versions(std::map
<string
, list
<string
>> &versions
)
2145 for (int osd
= 0; osd
< osdmap
.get_max_osd(); ++osd
) {
2146 if (osdmap
.is_up(osd
)) {
2147 map
<string
,string
> meta
;
2148 load_metadata(osd
, meta
, nullptr);
2149 auto p
= meta
.find("ceph_version_short");
2150 if (p
== meta
.end()) continue;
2151 versions
[p
->second
].push_back(string("osd.") + stringify(osd
));
2156 int OSDMonitor::get_osd_objectstore_type(int osd
, string
*type
)
2158 map
<string
, string
> metadata
;
2159 int r
= load_metadata(osd
, metadata
, nullptr);
2163 auto it
= metadata
.find("osd_objectstore");
2164 if (it
== metadata
.end())
2170 bool OSDMonitor::is_pool_currently_all_bluestore(int64_t pool_id
,
2171 const pg_pool_t
&pool
,
2174 // just check a few pgs for efficiency - this can't give a guarantee anyway,
2175 // since filestore osds could always join the pool later
2176 set
<int> checked_osds
;
2177 for (unsigned ps
= 0; ps
< std::min(8u, pool
.get_pg_num()); ++ps
) {
2178 vector
<int> up
, acting
;
2179 pg_t
pgid(ps
, pool_id
);
2180 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
2181 for (int osd
: up
) {
2182 if (checked_osds
.find(osd
) != checked_osds
.end())
2184 string objectstore_type
;
2185 int r
= get_osd_objectstore_type(osd
, &objectstore_type
);
2186 // allow with missing metadata, e.g. due to an osd never booting yet
2187 if (r
< 0 || objectstore_type
== "bluestore") {
2188 checked_osds
.insert(osd
);
2191 *err
<< "osd." << osd
<< " uses " << objectstore_type
;
2198 int OSDMonitor::dump_osd_metadata(int osd
, Formatter
*f
, ostream
*err
)
2200 map
<string
,string
> m
;
2201 if (int r
= load_metadata(osd
, m
, err
))
2203 for (map
<string
,string
>::iterator p
= m
.begin(); p
!= m
.end(); ++p
)
2204 f
->dump_string(p
->first
.c_str(), p
->second
);
2208 void OSDMonitor::print_nodes(Formatter
*f
)
2210 // group OSDs by their hosts
2211 map
<string
, list
<int> > osds
; // hostname => osd
2212 for (int osd
= 0; osd
< osdmap
.get_max_osd(); osd
++) {
2213 map
<string
, string
> m
;
2214 if (load_metadata(osd
, m
, NULL
)) {
2217 map
<string
, string
>::iterator hostname
= m
.find("hostname");
2218 if (hostname
== m
.end()) {
2219 // not likely though
2222 osds
[hostname
->second
].push_back(osd
);
2225 dump_services(f
, osds
, "osd");
2228 void OSDMonitor::share_map_with_random_osd()
2230 if (osdmap
.get_num_up_osds() == 0) {
2231 dout(10) << __func__
<< " no up osds, don't share with anyone" << dendl
;
2235 MonSession
*s
= mon
.session_map
.get_random_osd_session(&osdmap
);
2237 dout(10) << __func__
<< " no up osd on our session map" << dendl
;
2241 dout(10) << "committed, telling random " << s
->name
2242 << " all about it" << dendl
;
2244 // get feature of the peer
2245 // use quorum_con_features, if it's an anonymous connection.
2246 uint64_t features
= s
->con_features
? s
->con_features
:
2247 mon
.get_quorum_con_features();
2248 // whatev, they'll request more if they need it
2249 MOSDMap
*m
= build_incremental(osdmap
.get_epoch() - 1, osdmap
.get_epoch(), features
);
2250 s
->con
->send_message(m
);
2251 // NOTE: do *not* record osd has up to this epoch (as we do
2252 // elsewhere) as they may still need to request older values.
2255 version_t
OSDMonitor::get_trim_to() const
2257 if (mon
.get_quorum().empty()) {
2258 dout(10) << __func__
<< " quorum not formed, trim_to = 0" << dendl
;
2263 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
2264 if (!creating_pgs
.pgs
.empty()) {
2265 dout(10) << __func__
<< " pgs creating, trim_to = 0" << dendl
;
2270 if (g_conf().get_val
<bool>("mon_debug_block_osdmap_trim")) {
2272 << " blocking osdmap trim"
2273 << " ('mon_debug_block_osdmap_trim' set to 'true')"
2274 << " trim_to = 0" << dendl
;
2279 epoch_t floor
= get_min_last_epoch_clean();
2280 dout(10) << " min_last_epoch_clean " << floor
<< dendl
;
2281 if (g_conf()->mon_osd_force_trim_to
> 0 &&
2282 g_conf()->mon_osd_force_trim_to
< (int)get_last_committed()) {
2283 floor
= g_conf()->mon_osd_force_trim_to
;
2284 dout(10) << __func__
2285 << " explicit mon_osd_force_trim_to = " << floor
<< dendl
;
2287 unsigned min
= g_conf()->mon_min_osdmap_epochs
;
2288 if (floor
+ min
> get_last_committed()) {
2289 if (min
< get_last_committed())
2290 floor
= get_last_committed() - min
;
2294 if (floor
> get_first_committed()) {
2295 dout(10) << __func__
<< " trim_to = " << floor
<< dendl
;
2299 dout(10) << __func__
<< " trim_to = 0" << dendl
;
2303 epoch_t
OSDMonitor::get_min_last_epoch_clean() const
2305 auto floor
= last_epoch_clean
.get_lower_bound(osdmap
);
2306 // also scan osd epochs
2307 // don't trim past the oldest reported osd epoch
2308 for (auto [osd
, epoch
] : osd_epochs
) {
2309 if (epoch
< floor
) {
2316 void OSDMonitor::encode_trim_extra(MonitorDBStore::TransactionRef tx
,
2319 dout(10) << __func__
<< " including full map for e " << first
<< dendl
;
2321 get_version_full(first
, bl
);
2322 put_version_full(tx
, first
, bl
);
2324 if (has_osdmap_manifest
&&
2325 first
> osdmap_manifest
.get_first_pinned()) {
2326 _prune_update_trimmed(tx
, first
);
2331 /* full osdmap prune
2333 * for more information, please refer to doc/dev/mon-osdmap-prune.rst
2336 void OSDMonitor::load_osdmap_manifest()
2338 bool store_has_manifest
=
2339 mon
.store
->exists(get_service_name(), "osdmap_manifest");
2341 if (!store_has_manifest
) {
2342 if (!has_osdmap_manifest
) {
2346 dout(20) << __func__
2347 << " dropping osdmap manifest from memory." << dendl
;
2348 osdmap_manifest
= osdmap_manifest_t();
2349 has_osdmap_manifest
= false;
2353 dout(20) << __func__
2354 << " osdmap manifest detected in store; reload." << dendl
;
2356 bufferlist manifest_bl
;
2357 int r
= get_value("osdmap_manifest", manifest_bl
);
2359 derr
<< __func__
<< " unable to read osdmap version manifest" << dendl
;
2360 ceph_abort_msg("error reading manifest");
2362 osdmap_manifest
.decode(manifest_bl
);
2363 has_osdmap_manifest
= true;
2365 dout(10) << __func__
<< " store osdmap manifest pinned ("
2366 << osdmap_manifest
.get_first_pinned()
2368 << osdmap_manifest
.get_last_pinned()
2373 bool OSDMonitor::should_prune() const
2375 version_t first
= get_first_committed();
2376 version_t last
= get_last_committed();
2377 version_t min_osdmap_epochs
=
2378 g_conf().get_val
<int64_t>("mon_min_osdmap_epochs");
2379 version_t prune_min
=
2380 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2381 version_t prune_interval
=
2382 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2383 version_t last_pinned
= osdmap_manifest
.get_last_pinned();
2384 version_t last_to_pin
= last
- min_osdmap_epochs
;
2386 // Make it or break it constraints.
2388 // If any of these conditions fails, we will not prune, regardless of
2389 // whether we have an on-disk manifest with an on-going pruning state.
2391 if ((last
- first
) <= min_osdmap_epochs
) {
2392 // between the first and last committed epochs, we don't have
2393 // enough epochs to trim, much less to prune.
2394 dout(10) << __func__
2395 << " currently holding only " << (last
- first
)
2396 << " epochs (min osdmap epochs: " << min_osdmap_epochs
2397 << "); do not prune."
2401 } else if ((last_to_pin
- first
) < prune_min
) {
2402 // between the first committed epoch and the last epoch we would prune,
2403 // we simply don't have enough versions over the minimum to prune maps.
2404 dout(10) << __func__
2405 << " could only prune " << (last_to_pin
- first
)
2406 << " epochs (" << first
<< ".." << last_to_pin
<< "), which"
2407 " is less than the required minimum (" << prune_min
<< ")"
2411 } else if (has_osdmap_manifest
&& last_pinned
>= last_to_pin
) {
2412 dout(10) << __func__
2413 << " we have pruned as far as we can; do not prune."
2417 } else if (last_pinned
+ prune_interval
> last_to_pin
) {
2418 dout(10) << __func__
2419 << " not enough epochs to form an interval (last pinned: "
2420 << last_pinned
<< ", last to pin: "
2421 << last_to_pin
<< ", interval: " << prune_interval
<< ")"
2426 dout(15) << __func__
2427 << " should prune (" << last_pinned
<< ".." << last_to_pin
<< ")"
2428 << " lc (" << first
<< ".." << last
<< ")"
2433 void OSDMonitor::_prune_update_trimmed(
2434 MonitorDBStore::TransactionRef tx
,
2437 dout(10) << __func__
2438 << " first " << first
2439 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2442 osdmap_manifest_t manifest
= osdmap_manifest
;
2444 if (!manifest
.is_pinned(first
)) {
2445 manifest
.pin(first
);
2448 set
<version_t
>::iterator p_end
= manifest
.pinned
.find(first
);
2449 set
<version_t
>::iterator p
= manifest
.pinned
.begin();
2450 manifest
.pinned
.erase(p
, p_end
);
2451 ceph_assert(manifest
.get_first_pinned() == first
);
2453 if (manifest
.get_last_pinned() == first
+1 ||
2454 manifest
.pinned
.size() == 1) {
2455 // we reached the end of the line, as pinned maps go; clean up our
2456 // manifest, and let `should_prune()` decide whether we should prune
2458 tx
->erase(get_service_name(), "osdmap_manifest");
2463 manifest
.encode(bl
);
2464 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2467 void OSDMonitor::prune_init(osdmap_manifest_t
& manifest
)
2469 dout(1) << __func__
<< dendl
;
2471 version_t pin_first
;
2473 // verify constrainsts on stable in-memory state
2474 if (!has_osdmap_manifest
) {
2475 // we must have never pruned, OR if we pruned the state must no longer
2476 // be relevant (i.e., the state must have been removed alongside with
2477 // the trim that *must* have removed past the last pinned map in a
2479 ceph_assert(osdmap_manifest
.pinned
.empty());
2480 ceph_assert(!mon
.store
->exists(get_service_name(), "osdmap_manifest"));
2481 pin_first
= get_first_committed();
2484 // we must have pruned in the past AND its state is still relevant
2485 // (i.e., even if we trimmed, we still hold pinned maps in the manifest,
2486 // and thus we still hold a manifest in the store).
2487 ceph_assert(!osdmap_manifest
.pinned
.empty());
2488 ceph_assert(osdmap_manifest
.get_first_pinned() == get_first_committed());
2489 ceph_assert(osdmap_manifest
.get_last_pinned() < get_last_committed());
2491 dout(10) << __func__
2492 << " first_pinned " << osdmap_manifest
.get_first_pinned()
2493 << " last_pinned " << osdmap_manifest
.get_last_pinned()
2496 pin_first
= osdmap_manifest
.get_last_pinned();
2499 manifest
.pin(pin_first
);
2502 bool OSDMonitor::_prune_sanitize_options() const
2504 uint64_t prune_interval
=
2505 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2506 uint64_t prune_min
=
2507 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_min");
2509 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2513 if (prune_interval
== 0) {
2515 << " prune is enabled BUT prune interval is zero; abort."
2518 } else if (prune_interval
== 1) {
2520 << " prune interval is equal to one, which essentially means"
2521 " no pruning; abort."
2525 if (prune_min
== 0) {
2527 << " prune is enabled BUT prune min is zero; abort."
2531 if (prune_interval
> prune_min
) {
2533 << " impossible to ascertain proper prune interval because"
2534 << " it is greater than the minimum prune epochs"
2535 << " (min: " << prune_min
<< ", interval: " << prune_interval
<< ")"
2540 if (txsize
< prune_interval
- 1) {
2542 << " 'mon_osdmap_full_prune_txsize' (" << txsize
2543 << ") < 'mon_osdmap_full_prune_interval-1' (" << prune_interval
- 1
2544 << "); abort." << dendl
;
2550 bool OSDMonitor::is_prune_enabled() const {
2551 return g_conf().get_val
<bool>("mon_osdmap_full_prune_enabled");
2554 bool OSDMonitor::is_prune_supported() const {
2555 return mon
.get_required_mon_features().contains_any(
2556 ceph::features::mon::FEATURE_OSDMAP_PRUNE
);
2561 * @returns true if has side-effects; false otherwise.
2563 bool OSDMonitor::do_prune(MonitorDBStore::TransactionRef tx
)
2565 bool enabled
= is_prune_enabled();
2567 dout(1) << __func__
<< " osdmap full prune "
2568 << ( enabled
? "enabled" : "disabled")
2571 if (!enabled
|| !_prune_sanitize_options() || !should_prune()) {
2575 // we are beyond the minimum prune versions, we need to remove maps because
2576 // otherwise the store will grow unbounded and we may end up having issues
2577 // with available disk space or store hangs.
2579 // we will not pin all versions. We will leave a buffer number of versions.
2580 // this allows us the monitor to trim maps without caring too much about
2581 // pinned maps, and then allow us to use another ceph-mon without these
2582 // capabilities, without having to repair the store.
2584 osdmap_manifest_t manifest
= osdmap_manifest
;
2586 version_t first
= get_first_committed();
2587 version_t last
= get_last_committed();
2589 version_t last_to_pin
= last
- g_conf()->mon_min_osdmap_epochs
;
2590 version_t last_pinned
= manifest
.get_last_pinned();
2591 uint64_t prune_interval
=
2592 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_interval");
2594 g_conf().get_val
<uint64_t>("mon_osdmap_full_prune_txsize");
2596 prune_init(manifest
);
2598 // we need to get rid of some osdmaps
2601 << " lc (" << first
<< " .. " << last
<< ")"
2602 << " last_pinned " << last_pinned
2603 << " interval " << prune_interval
2604 << " last_to_pin " << last_to_pin
2607 // We will be erasing maps as we go.
2609 // We will erase all maps between `last_pinned` and the `next_to_pin`.
2611 // If `next_to_pin` happens to be greater than `last_to_pin`, then
2612 // we stop pruning. We could prune the maps between `next_to_pin` and
2613 // `last_to_pin`, but by not doing it we end up with neater pruned
2614 // intervals, aligned with `prune_interval`. Besides, this should not be a
2615 // problem as long as `prune_interval` is set to a sane value, instead of
2616 // hundreds or thousands of maps.
2618 auto map_exists
= [this](version_t v
) {
2619 string k
= mon
.store
->combine_strings("full", v
);
2620 return mon
.store
->exists(get_service_name(), k
);
2623 // 'interval' represents the number of maps from the last pinned
2624 // i.e., if we pinned version 1 and have an interval of 10, we're pinning
2625 // version 11 next; all intermediate versions will be removed.
2627 // 'txsize' represents the maximum number of versions we'll be removing in
2628 // this iteration. If 'txsize' is large enough to perform multiple passes
2629 // pinning and removing maps, we will do so; if not, we'll do at least one
2630 // pass. We are quite relaxed about honouring 'txsize', but we'll always
2631 // ensure that we never go *over* the maximum.
2633 // e.g., if we pin 1 and 11, we're removing versions [2..10]; i.e., 9 maps.
2634 uint64_t removal_interval
= prune_interval
- 1;
2636 if (txsize
< removal_interval
) {
2638 << " setting txsize to removal interval size ("
2639 << removal_interval
<< " versions"
2641 txsize
= removal_interval
;
2643 ceph_assert(removal_interval
> 0);
2645 uint64_t num_pruned
= 0;
2646 while (num_pruned
+ removal_interval
<= txsize
) {
2647 last_pinned
= manifest
.get_last_pinned();
2649 if (last_pinned
+ prune_interval
> last_to_pin
) {
2652 ceph_assert(last_pinned
< last_to_pin
);
2654 version_t next_pinned
= last_pinned
+ prune_interval
;
2655 ceph_assert(next_pinned
<= last_to_pin
);
2656 manifest
.pin(next_pinned
);
2658 dout(20) << __func__
2659 << " last_pinned " << last_pinned
2660 << " next_pinned " << next_pinned
2661 << " num_pruned " << num_pruned
2662 << " removal interval (" << (last_pinned
+1)
2663 << ".." << (next_pinned
-1) << ")"
2664 << " txsize " << txsize
<< dendl
;
2666 ceph_assert(map_exists(last_pinned
));
2667 ceph_assert(map_exists(next_pinned
));
2669 for (version_t v
= last_pinned
+1; v
< next_pinned
; ++v
) {
2670 ceph_assert(!manifest
.is_pinned(v
));
2672 dout(20) << __func__
<< " pruning full osdmap e" << v
<< dendl
;
2673 string full_key
= mon
.store
->combine_strings("full", v
);
2674 tx
->erase(get_service_name(), full_key
);
2679 ceph_assert(num_pruned
> 0);
2682 manifest
.encode(bl
);
2683 tx
->put(get_service_name(), "osdmap_manifest", bl
);
2691 bool OSDMonitor::preprocess_query(MonOpRequestRef op
)
2693 op
->mark_osdmon_event(__func__
);
2694 Message
*m
= op
->get_req();
2695 dout(10) << "preprocess_query " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2697 switch (m
->get_type()) {
2699 case MSG_MON_COMMAND
:
2701 return preprocess_command(op
);
2702 } catch (const bad_cmd_get
& e
) {
2704 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2707 case CEPH_MSG_MON_GET_OSDMAP
:
2708 return preprocess_get_osdmap(op
);
2711 case MSG_OSD_MARK_ME_DOWN
:
2712 return preprocess_mark_me_down(op
);
2713 case MSG_OSD_MARK_ME_DEAD
:
2714 return preprocess_mark_me_dead(op
);
2716 return preprocess_full(op
);
2717 case MSG_OSD_FAILURE
:
2718 return preprocess_failure(op
);
2720 return preprocess_boot(op
);
2722 return preprocess_alive(op
);
2723 case MSG_OSD_PG_CREATED
:
2724 return preprocess_pg_created(op
);
2725 case MSG_OSD_PG_READY_TO_MERGE
:
2726 return preprocess_pg_ready_to_merge(op
);
2727 case MSG_OSD_PGTEMP
:
2728 return preprocess_pgtemp(op
);
2729 case MSG_OSD_BEACON
:
2730 return preprocess_beacon(op
);
2732 case CEPH_MSG_POOLOP
:
2733 return preprocess_pool_op(op
);
2735 case MSG_REMOVE_SNAPS
:
2736 return preprocess_remove_snaps(op
);
2738 case MSG_MON_GET_PURGED_SNAPS
:
2739 return preprocess_get_purged_snaps(op
);
2747 bool OSDMonitor::prepare_update(MonOpRequestRef op
)
2749 op
->mark_osdmon_event(__func__
);
2750 Message
*m
= op
->get_req();
2751 dout(7) << "prepare_update " << *m
<< " from " << m
->get_orig_source_inst() << dendl
;
2753 switch (m
->get_type()) {
2755 case MSG_OSD_MARK_ME_DOWN
:
2756 return prepare_mark_me_down(op
);
2757 case MSG_OSD_MARK_ME_DEAD
:
2758 return prepare_mark_me_dead(op
);
2760 return prepare_full(op
);
2761 case MSG_OSD_FAILURE
:
2762 return prepare_failure(op
);
2764 return prepare_boot(op
);
2766 return prepare_alive(op
);
2767 case MSG_OSD_PG_CREATED
:
2768 return prepare_pg_created(op
);
2769 case MSG_OSD_PGTEMP
:
2770 return prepare_pgtemp(op
);
2771 case MSG_OSD_PG_READY_TO_MERGE
:
2772 return prepare_pg_ready_to_merge(op
);
2773 case MSG_OSD_BEACON
:
2774 return prepare_beacon(op
);
2776 case MSG_MON_COMMAND
:
2778 return prepare_command(op
);
2779 } catch (const bad_cmd_get
& e
) {
2781 mon
.reply_command(op
, -EINVAL
, e
.what(), bl
, get_last_committed());
2785 case CEPH_MSG_POOLOP
:
2786 return prepare_pool_op(op
);
2788 case MSG_REMOVE_SNAPS
:
2789 return prepare_remove_snaps(op
);
2799 bool OSDMonitor::should_propose(double& delay
)
2801 dout(10) << "should_propose" << dendl
;
2803 // if full map, propose immediately! any subsequent changes will be clobbered.
2804 if (pending_inc
.fullmap
.length())
2807 // adjust osd weights?
2808 if (!osd_weight
.empty() &&
2809 osd_weight
.size() == (unsigned)osdmap
.get_max_osd()) {
2810 dout(0) << " adjusting osd weights based on " << osd_weight
<< dendl
;
2811 osdmap
.adjust_osd_weights(osd_weight
, pending_inc
);
2817 return PaxosService::should_propose(delay
);
2822 // ---------------------------
2825 bool OSDMonitor::preprocess_get_osdmap(MonOpRequestRef op
)
2827 op
->mark_osdmon_event(__func__
);
2828 auto m
= op
->get_req
<MMonGetOSDMap
>();
2830 uint64_t features
= mon
.get_quorum_con_features();
2831 if (op
->get_session() && op
->get_session()->con_features
)
2832 features
= op
->get_session()->con_features
;
2834 dout(10) << __func__
<< " " << *m
<< dendl
;
2835 MOSDMap
*reply
= new MOSDMap(mon
.monmap
->fsid
, features
);
2836 epoch_t first
= get_first_committed();
2837 epoch_t last
= osdmap
.get_epoch();
2838 int max
= g_conf()->osd_map_message_max
;
2839 ssize_t max_bytes
= g_conf()->osd_map_message_max_bytes
;
2840 for (epoch_t e
= std::max(first
, m
->get_full_first());
2841 e
<= std::min(last
, m
->get_full_last()) && max
> 0 && max_bytes
> 0;
2843 bufferlist
& bl
= reply
->maps
[e
];
2844 int r
= get_version_full(e
, features
, bl
);
2845 ceph_assert(r
>= 0);
2846 max_bytes
-= bl
.length();
2848 for (epoch_t e
= std::max(first
, m
->get_inc_first());
2849 e
<= std::min(last
, m
->get_inc_last()) && max
> 0 && max_bytes
> 0;
2851 bufferlist
& bl
= reply
->incremental_maps
[e
];
2852 int r
= get_version(e
, features
, bl
);
2853 ceph_assert(r
>= 0);
2854 max_bytes
-= bl
.length();
2856 reply
->oldest_map
= first
;
2857 reply
->newest_map
= last
;
2858 mon
.send_reply(op
, reply
);
2863 // ---------------------------
2868 bool OSDMonitor::check_source(MonOpRequestRef op
, uuid_d fsid
) {
2869 // check permissions
2870 MonSession
*session
= op
->get_session();
2873 if (!session
->is_capable("osd", MON_CAP_X
)) {
2874 dout(0) << "got MOSDFailure from entity with insufficient caps "
2875 << session
->caps
<< dendl
;
2878 if (fsid
!= mon
.monmap
->fsid
) {
2879 dout(0) << "check_source: on fsid " << fsid
2880 << " != " << mon
.monmap
->fsid
<< dendl
;
2887 bool OSDMonitor::preprocess_failure(MonOpRequestRef op
)
2889 op
->mark_osdmon_event(__func__
);
2890 auto m
= op
->get_req
<MOSDFailure
>();
2891 // who is target_osd
2892 int badboy
= m
->get_target_osd();
2894 // check permissions
2895 if (check_source(op
, m
->fsid
))
2898 // first, verify the reporting host is valid
2899 if (m
->get_orig_source().is_osd()) {
2900 int from
= m
->get_orig_source().num();
2901 if (!osdmap
.exists(from
) ||
2902 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) ||
2903 (osdmap
.is_down(from
) && m
->if_osd_failed())) {
2904 dout(5) << "preprocess_failure from dead osd." << from
2905 << ", ignoring" << dendl
;
2906 send_incremental(op
, m
->get_epoch()+1);
2913 if (osdmap
.is_down(badboy
)) {
2914 dout(5) << "preprocess_failure dne(/dup?): osd." << m
->get_target_osd()
2915 << " " << m
->get_target_addrs()
2916 << ", from " << m
->get_orig_source() << dendl
;
2917 if (m
->get_epoch() < osdmap
.get_epoch())
2918 send_incremental(op
, m
->get_epoch()+1);
2921 if (osdmap
.get_addrs(badboy
) != m
->get_target_addrs()) {
2922 dout(5) << "preprocess_failure wrong osd: report osd." << m
->get_target_osd()
2923 << " " << m
->get_target_addrs()
2924 << " != map's " << osdmap
.get_addrs(badboy
)
2925 << ", from " << m
->get_orig_source() << dendl
;
2926 if (m
->get_epoch() < osdmap
.get_epoch())
2927 send_incremental(op
, m
->get_epoch()+1);
2931 // already reported?
2932 if (osdmap
.is_down(badboy
) ||
2933 osdmap
.get_up_from(badboy
) > m
->get_epoch()) {
2934 dout(5) << "preprocess_failure dup/old: osd." << m
->get_target_osd()
2935 << " " << m
->get_target_addrs()
2936 << ", from " << m
->get_orig_source() << dendl
;
2937 if (m
->get_epoch() < osdmap
.get_epoch())
2938 send_incremental(op
, m
->get_epoch()+1);
2942 if (!can_mark_down(badboy
)) {
2943 dout(5) << "preprocess_failure ignoring report of osd."
2944 << m
->get_target_osd() << " " << m
->get_target_addrs()
2945 << " from " << m
->get_orig_source() << dendl
;
2949 dout(10) << "preprocess_failure new: osd." << m
->get_target_osd()
2950 << " " << m
->get_target_addrs()
2951 << ", from " << m
->get_orig_source() << dendl
;
2959 class C_AckMarkedDown
: public C_MonOp
{
2965 : C_MonOp(op
), osdmon(osdmon
) {}
2967 void _finish(int r
) override
{
2969 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2970 osdmon
->mon
.send_reply(
2977 false)); // ACK itself does not request an ack
2978 } else if (r
== -EAGAIN
) {
2979 osdmon
->dispatch(op
);
2981 ceph_abort_msgf("C_AckMarkedDown: unknown result %d", r
);
2984 ~C_AckMarkedDown() override
{
2988 bool OSDMonitor::preprocess_mark_me_down(MonOpRequestRef op
)
2990 op
->mark_osdmon_event(__func__
);
2991 auto m
= op
->get_req
<MOSDMarkMeDown
>();
2992 int from
= m
->target_osd
;
2994 // check permissions
2995 if (check_source(op
, m
->fsid
))
2998 // first, verify the reporting host is valid
2999 if (!m
->get_orig_source().is_osd())
3002 if (!osdmap
.exists(from
) ||
3003 osdmap
.is_down(from
) ||
3004 osdmap
.get_addrs(from
) != m
->target_addrs
) {
3005 dout(5) << "preprocess_mark_me_down from dead osd."
3006 << from
<< ", ignoring" << dendl
;
3007 send_incremental(op
, m
->get_epoch()+1);
3011 // no down might be set
3012 if (!can_mark_down(from
))
3015 dout(10) << "MOSDMarkMeDown for: " << m
->get_orig_source()
3016 << " " << m
->target_addrs
<< dendl
;
3020 if (m
->request_ack
) {
3021 Context
*c(new C_AckMarkedDown(this, op
));
3027 bool OSDMonitor::prepare_mark_me_down(MonOpRequestRef op
)
3029 op
->mark_osdmon_event(__func__
);
3030 auto m
= op
->get_req
<MOSDMarkMeDown
>();
3031 int target_osd
= m
->target_osd
;
3033 ceph_assert(osdmap
.is_up(target_osd
));
3034 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->target_addrs
);
3036 mon
.clog
->info() << "osd." << target_osd
<< " marked itself down";
3037 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3039 wait_for_finished_proposal(op
, new C_AckMarkedDown(this, op
));
3043 bool OSDMonitor::preprocess_mark_me_dead(MonOpRequestRef op
)
3045 op
->mark_osdmon_event(__func__
);
3046 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3047 int from
= m
->target_osd
;
3049 // check permissions
3050 if (check_source(op
, m
->fsid
)) {
3055 // first, verify the reporting host is valid
3056 if (!m
->get_orig_source().is_osd()) {
3061 if (!osdmap
.exists(from
) ||
3062 !osdmap
.is_down(from
)) {
3063 dout(5) << __func__
<< " from nonexistent or up osd." << from
3064 << ", ignoring" << dendl
;
3065 send_incremental(op
, m
->get_epoch()+1);
3073 bool OSDMonitor::prepare_mark_me_dead(MonOpRequestRef op
)
3075 op
->mark_osdmon_event(__func__
);
3076 auto m
= op
->get_req
<MOSDMarkMeDead
>();
3077 int target_osd
= m
->target_osd
;
3079 ceph_assert(osdmap
.is_down(target_osd
));
3081 mon
.clog
->info() << "osd." << target_osd
<< " marked itself dead as of e"
3083 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3084 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3086 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= m
->get_epoch();
3087 wait_for_finished_proposal(
3090 [op
, this] (int r
) {
3092 mon
.no_reply(op
); // ignore on success
3099 bool OSDMonitor::can_mark_down(int i
)
3101 if (osdmap
.is_nodown(i
)) {
3102 dout(5) << __func__
<< " osd." << i
<< " is marked as nodown, "
3103 << "will not mark it down" << dendl
;
3107 int num_osds
= osdmap
.get_num_osds();
3108 if (num_osds
== 0) {
3109 dout(5) << __func__
<< " no osds" << dendl
;
3112 int up
= osdmap
.get_num_up_osds() - pending_inc
.get_net_marked_down(&osdmap
);
3113 float up_ratio
= (float)up
/ (float)num_osds
;
3114 if (up_ratio
< g_conf()->mon_osd_min_up_ratio
) {
3115 dout(2) << __func__
<< " current up_ratio " << up_ratio
<< " < min "
3116 << g_conf()->mon_osd_min_up_ratio
3117 << ", will not mark osd." << i
<< " down" << dendl
;
3123 bool OSDMonitor::can_mark_up(int i
)
3125 if (osdmap
.is_noup(i
)) {
3126 dout(5) << __func__
<< " osd." << i
<< " is marked as noup, "
3127 << "will not mark it up" << dendl
;
3135 * @note the parameter @p i apparently only exists here so we can output the
3136 * osd's id on messages.
3138 bool OSDMonitor::can_mark_out(int i
)
3140 if (osdmap
.is_noout(i
)) {
3141 dout(5) << __func__
<< " osd." << i
<< " is marked as noout, "
3142 << "will not mark it out" << dendl
;
3146 int num_osds
= osdmap
.get_num_osds();
3147 if (num_osds
== 0) {
3148 dout(5) << __func__
<< " no osds" << dendl
;
3151 int in
= osdmap
.get_num_in_osds() - pending_inc
.get_net_marked_out(&osdmap
);
3152 float in_ratio
= (float)in
/ (float)num_osds
;
3153 if (in_ratio
< g_conf()->mon_osd_min_in_ratio
) {
3155 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3156 << g_conf()->mon_osd_min_in_ratio
3157 << ", will not mark osd." << i
<< " out" << dendl
;
3159 dout(5) << __func__
<< " current in_ratio " << in_ratio
<< " < min "
3160 << g_conf()->mon_osd_min_in_ratio
3161 << ", will not mark osds out" << dendl
;
3168 bool OSDMonitor::can_mark_in(int i
)
3170 if (osdmap
.is_noin(i
)) {
3171 dout(5) << __func__
<< " osd." << i
<< " is marked as noin, "
3172 << "will not mark it in" << dendl
;
3179 bool OSDMonitor::check_failures(utime_t now
)
3181 bool found_failure
= false;
3182 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3183 p
!= failure_info
.end();
3185 if (can_mark_down(p
->first
)) {
3186 found_failure
|= check_failure(now
, p
->first
, p
->second
);
3189 return found_failure
;
3192 bool OSDMonitor::check_failure(utime_t now
, int target_osd
, failure_info_t
& fi
)
3194 // already pending failure?
3195 if (pending_inc
.new_state
.count(target_osd
) &&
3196 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3197 dout(10) << " already pending failure" << dendl
;
3201 set
<string
> reporters_by_subtree
;
3202 auto reporter_subtree_level
= g_conf().get_val
<string
>("mon_osd_reporter_subtree_level");
3203 utime_t
orig_grace(g_conf()->osd_heartbeat_grace
, 0);
3204 utime_t max_failed_since
= fi
.get_failed_since();
3205 utime_t failed_for
= now
- max_failed_since
;
3207 utime_t grace
= orig_grace
;
3208 double my_grace
= 0, peer_grace
= 0;
3210 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3211 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
3212 decay_k
= ::log(.5) / halflife
;
3214 // scale grace period based on historical probability of 'lagginess'
3215 // (false positive failures due to slowness).
3216 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(target_osd
);
3217 double decay
= exp((double)failed_for
* decay_k
);
3218 dout(20) << " halflife " << halflife
<< " decay_k " << decay_k
3219 << " failed_for " << failed_for
<< " decay " << decay
<< dendl
;
3220 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3224 // consider the peers reporting a failure a proxy for a potential
3225 // 'subcluster' over the overall cluster that is similarly
3226 // laggy. this is clearly not true in all cases, but will sometimes
3227 // help us localize the grace correction to a subset of the system
3228 // (say, a rack with a bad switch) that is unhappy.
3229 ceph_assert(fi
.reporters
.size());
3230 for (auto p
= fi
.reporters
.begin(); p
!= fi
.reporters
.end();) {
3231 // get the parent bucket whose type matches with "reporter_subtree_level".
3232 // fall back to OSD if the level doesn't exist.
3233 if (osdmap
.exists(p
->first
)) {
3234 auto reporter_loc
= osdmap
.crush
->get_full_location(p
->first
);
3235 if (auto iter
= reporter_loc
.find(reporter_subtree_level
);
3236 iter
== reporter_loc
.end()) {
3237 reporters_by_subtree
.insert("osd." + to_string(p
->first
));
3239 reporters_by_subtree
.insert(iter
->second
);
3241 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3242 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(p
->first
);
3243 utime_t elapsed
= now
- xi
.down_stamp
;
3244 double decay
= exp((double)elapsed
* decay_k
);
3245 peer_grace
+= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
3249 fi
.cancel_report(p
->first
);;
3250 p
= fi
.reporters
.erase(p
);
3254 if (g_conf()->mon_osd_adjust_heartbeat_grace
) {
3255 peer_grace
/= (double)fi
.reporters
.size();
3256 grace
+= peer_grace
;
3259 dout(10) << " osd." << target_osd
<< " has "
3260 << fi
.reporters
.size() << " reporters, "
3261 << grace
<< " grace (" << orig_grace
<< " + " << my_grace
3262 << " + " << peer_grace
<< "), max_failed_since " << max_failed_since
3265 if (failed_for
>= grace
&&
3266 reporters_by_subtree
.size() >= g_conf().get_val
<uint64_t>("mon_osd_min_down_reporters")) {
3267 dout(1) << " we have enough reporters to mark osd." << target_osd
3268 << " down" << dendl
;
3269 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3271 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3272 << osdmap
.crush
->get_full_location_ordered_string(
3275 << (int)reporters_by_subtree
.size()
3276 << " reporters from different "
3277 << reporter_subtree_level
<< " after "
3278 << failed_for
<< " >= grace " << grace
<< ")";
3284 void OSDMonitor::force_failure(int target_osd
, int by
)
3286 // already pending failure?
3287 if (pending_inc
.new_state
.count(target_osd
) &&
3288 pending_inc
.new_state
[target_osd
] & CEPH_OSD_UP
) {
3289 dout(10) << " already pending failure" << dendl
;
3293 dout(1) << " we're forcing failure of osd." << target_osd
<< dendl
;
3294 pending_inc
.new_state
[target_osd
] = CEPH_OSD_UP
;
3295 if (!pending_inc
.new_xinfo
.count(target_osd
)) {
3296 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3298 pending_inc
.new_xinfo
[target_osd
].dead_epoch
= pending_inc
.epoch
;
3300 mon
.clog
->info() << "osd." << target_osd
<< " failed ("
3301 << osdmap
.crush
->get_full_location_ordered_string(target_osd
)
3302 << ") (connection refused reported by osd." << by
<< ")";
3306 bool OSDMonitor::prepare_failure(MonOpRequestRef op
)
3308 op
->mark_osdmon_event(__func__
);
3309 auto m
= op
->get_req
<MOSDFailure
>();
3310 dout(1) << "prepare_failure osd." << m
->get_target_osd()
3311 << " " << m
->get_target_addrs()
3312 << " from " << m
->get_orig_source()
3313 << " is reporting failure:" << m
->if_osd_failed() << dendl
;
3315 int target_osd
= m
->get_target_osd();
3316 int reporter
= m
->get_orig_source().num();
3317 ceph_assert(osdmap
.is_up(target_osd
));
3318 ceph_assert(osdmap
.get_addrs(target_osd
) == m
->get_target_addrs());
3322 if (m
->if_osd_failed()) {
3323 // calculate failure time
3324 utime_t now
= ceph_clock_now();
3325 utime_t failed_since
=
3326 m
->get_recv_stamp() - utime_t(m
->failed_for
, 0);
3329 if (m
->is_immediate()) {
3330 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3331 << " reported immediately failed by "
3332 << m
->get_orig_source();
3333 force_failure(target_osd
, reporter
);
3336 mon
.clog
->debug() << "osd." << m
->get_target_osd() << " reported failed by "
3337 << m
->get_orig_source();
3339 failure_info_t
& fi
= failure_info
[target_osd
];
3340 MonOpRequestRef old_op
= fi
.add_report(reporter
, failed_since
, op
);
3342 mon
.no_reply(old_op
);
3345 return check_failure(now
, target_osd
, fi
);
3347 // remove the report
3348 mon
.clog
->debug() << "osd." << m
->get_target_osd()
3349 << " failure report canceled by "
3350 << m
->get_orig_source();
3351 if (failure_info
.count(target_osd
)) {
3352 failure_info_t
& fi
= failure_info
[target_osd
];
3353 MonOpRequestRef report_op
= fi
.cancel_report(reporter
);
3355 mon
.no_reply(report_op
);
3357 if (fi
.reporters
.empty()) {
3358 dout(10) << " removing last failure_info for osd." << target_osd
3360 failure_info
.erase(target_osd
);
3362 dout(10) << " failure_info for osd." << target_osd
<< " now "
3363 << fi
.reporters
.size() << " reporters" << dendl
;
3366 dout(10) << " no failure_info for osd." << target_osd
<< dendl
;
3373 void OSDMonitor::process_failures()
3375 map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3376 while (p
!= failure_info
.end()) {
3377 if (osdmap
.is_up(p
->first
)) {
3380 dout(10) << "process_failures osd." << p
->first
<< dendl
;
3381 list
<MonOpRequestRef
> ls
;
3382 p
->second
.take_report_messages(ls
);
3383 failure_info
.erase(p
++);
3385 while (!ls
.empty()) {
3386 MonOpRequestRef o
= ls
.front();
3388 o
->mark_event(__func__
);
3389 MOSDFailure
*m
= o
->get_req
<MOSDFailure
>();
3390 send_latest(o
, m
->get_epoch());
3399 void OSDMonitor::take_all_failures(list
<MonOpRequestRef
>& ls
)
3401 dout(10) << __func__
<< " on " << failure_info
.size() << " osds" << dendl
;
3403 for (map
<int,failure_info_t
>::iterator p
= failure_info
.begin();
3404 p
!= failure_info
.end();
3406 p
->second
.take_report_messages(ls
);
3408 failure_info
.clear();
3411 int OSDMonitor::get_grace_interval_threshold()
3413 int halflife
= g_conf()->mon_osd_laggy_halflife
;
3414 // Scale the halflife period (default: 1_hr) by
3415 // a factor (48) to calculate the threshold.
3416 int grace_threshold_factor
= 48;
3417 return halflife
* grace_threshold_factor
;
3420 bool OSDMonitor::grace_interval_threshold_exceeded(int last_failed_interval
)
3422 int grace_interval_threshold_secs
= get_grace_interval_threshold();
3423 if (last_failed_interval
> grace_interval_threshold_secs
) {
3424 dout(1) << " last_failed_interval " << last_failed_interval
3425 << " > grace_interval_threshold_secs " << grace_interval_threshold_secs
3432 void OSDMonitor::set_default_laggy_params(int target_osd
)
3434 if (pending_inc
.new_xinfo
.count(target_osd
) == 0) {
3435 pending_inc
.new_xinfo
[target_osd
] = osdmap
.osd_xinfo
[target_osd
];
3437 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[target_osd
];
3438 xi
.down_stamp
= pending_inc
.modified
;
3439 xi
.laggy_probability
= 0.0;
3440 xi
.laggy_interval
= 0;
3441 dout(20) << __func__
<< " reset laggy, now xi " << xi
<< dendl
;
3447 bool OSDMonitor::preprocess_boot(MonOpRequestRef op
)
3449 op
->mark_osdmon_event(__func__
);
3450 auto m
= op
->get_req
<MOSDBoot
>();
3451 int from
= m
->get_orig_source_inst().name
.num();
3453 // check permissions, ignore if failed (no response expected)
3454 MonSession
*session
= op
->get_session();
3457 if (!session
->is_capable("osd", MON_CAP_X
)) {
3458 dout(0) << "got preprocess_boot message from entity with insufficient caps"
3459 << session
->caps
<< dendl
;
3463 if (m
->sb
.cluster_fsid
!= mon
.monmap
->fsid
) {
3464 dout(0) << "preprocess_boot on fsid " << m
->sb
.cluster_fsid
3465 << " != " << mon
.monmap
->fsid
<< dendl
;
3469 if (m
->get_orig_source_inst().addr
.is_blank_ip()) {
3470 dout(0) << "preprocess_boot got blank addr for " << m
->get_orig_source_inst() << dendl
;
3474 ceph_assert(m
->get_orig_source_inst().name
.is_osd());
3476 // force all osds to have gone through luminous prior to upgrade to nautilus
3478 vector
<string
> missing
;
3479 if (!HAVE_FEATURE(m
->osd_features
, SERVER_LUMINOUS
)) {
3480 missing
.push_back("CEPH_FEATURE_SERVER_LUMINOUS");
3482 if (!HAVE_FEATURE(m
->osd_features
, SERVER_JEWEL
)) {
3483 missing
.push_back("CEPH_FEATURE_SERVER_JEWEL");
3485 if (!HAVE_FEATURE(m
->osd_features
, SERVER_KRAKEN
)) {
3486 missing
.push_back("CEPH_FEATURE_SERVER_KRAKEN");
3488 if (!HAVE_FEATURE(m
->osd_features
, OSD_RECOVERY_DELETES
)) {
3489 missing
.push_back("CEPH_FEATURE_OSD_RECOVERY_DELETES");
3492 if (!missing
.empty()) {
3493 using std::experimental::make_ostream_joiner
;
3496 copy(begin(missing
), end(missing
), make_ostream_joiner(ss
, ";"));
3498 mon
.clog
->info() << "disallowing boot of OSD "
3499 << m
->get_orig_source_inst()
3500 << " because the osd lacks " << ss
.str();
3505 // make sure osd versions do not span more than 3 releases
3506 if (HAVE_FEATURE(m
->osd_features
, SERVER_OCTOPUS
) &&
3507 osdmap
.require_osd_release
< ceph_release_t::mimic
) {
3508 mon
.clog
->info() << "disallowing boot of octopus+ OSD "
3509 << m
->get_orig_source_inst()
3510 << " because require_osd_release < mimic";
3513 if (HAVE_FEATURE(m
->osd_features
, SERVER_PACIFIC
) &&
3514 osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
3515 mon
.clog
->info() << "disallowing boot of pacific+ OSD "
3516 << m
->get_orig_source_inst()
3517 << " because require_osd_release < nautilus";
3521 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
3522 // we are reusing a jewel feature bit that was retired in luminous.
3523 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
3524 osdmap
.test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT
) &&
3525 !(m
->osd_features
& CEPH_FEATURE_OSD_PGLOG_HARDLIMIT
)) {
3526 mon
.clog
->info() << "disallowing boot of OSD "
3527 << m
->get_orig_source_inst()
3528 << " because 'pglog_hardlimit' osdmap flag is set and OSD lacks the OSD_PGLOG_HARDLIMIT feature";
3532 if (osdmap
.stretch_mode_enabled
&&
3533 !(m
->osd_features
& CEPH_FEATUREMASK_STRETCH_MODE
)) {
3534 mon
.clog
->info() << "disallowing boot of OSD "
3535 << m
->get_orig_source_inst()
3536 << " because stretch mode is on and OSD lacks support";
3541 if (osdmap
.is_up(from
) &&
3542 osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()) &&
3543 osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
)) {
3545 dout(7) << "preprocess_boot dup from " << m
->get_orig_source()
3546 << " " << m
->get_orig_source_addrs()
3547 << " =~ " << osdmap
.get_addrs(from
) << dendl
;
3552 if (osdmap
.exists(from
) &&
3553 !osdmap
.get_uuid(from
).is_zero() &&
3554 osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3555 dout(7) << __func__
<< " from " << m
->get_orig_source_inst()
3556 << " clashes with existing osd: different fsid"
3557 << " (ours: " << osdmap
.get_uuid(from
)
3558 << " ; theirs: " << m
->sb
.osd_fsid
<< ")" << dendl
;
3562 if (osdmap
.exists(from
) &&
3563 osdmap
.get_info(from
).up_from
> m
->version
&&
3564 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3565 m
->get_orig_source_addrs())) {
3566 dout(7) << "prepare_boot msg from before last up_from, ignoring" << dendl
;
3567 send_latest(op
, m
->sb
.current_epoch
+1);
3572 if (!can_mark_up(from
)) {
3573 dout(7) << "preprocess_boot ignoring boot from " << m
->get_orig_source_inst() << dendl
;
3574 send_latest(op
, m
->sb
.current_epoch
+1);
3578 dout(10) << "preprocess_boot from " << m
->get_orig_source_inst() << dendl
;
3585 bool OSDMonitor::prepare_boot(MonOpRequestRef op
)
3587 op
->mark_osdmon_event(__func__
);
3588 auto m
= op
->get_req
<MOSDBoot
>();
3589 dout(7) << __func__
<< " from " << m
->get_source()
3591 << " client_addrs" << m
->get_connection()->get_peer_addrs()
3592 << " cluster_addrs " << m
->cluster_addrs
3593 << " hb_back_addrs " << m
->hb_back_addrs
3594 << " hb_front_addrs " << m
->hb_front_addrs
3597 ceph_assert(m
->get_orig_source().is_osd());
3598 int from
= m
->get_orig_source().num();
3600 // does this osd exist?
3601 if (from
>= osdmap
.get_max_osd()) {
3602 dout(1) << "boot from osd." << from
<< " >= max_osd "
3603 << osdmap
.get_max_osd() << dendl
;
3607 int oldstate
= osdmap
.exists(from
) ? osdmap
.get_state(from
) : CEPH_OSD_NEW
;
3608 if (pending_inc
.new_state
.count(from
))
3609 oldstate
^= pending_inc
.new_state
[from
];
3611 // already up? mark down first?
3612 if (osdmap
.is_up(from
)) {
3613 dout(7) << __func__
<< " was up, first marking down osd." << from
<< " "
3614 << osdmap
.get_addrs(from
) << dendl
;
3615 // preprocess should have caught these; if not, assert.
3616 ceph_assert(!osdmap
.get_addrs(from
).legacy_equals(
3617 m
->get_orig_source_addrs()) ||
3618 !osdmap
.get_cluster_addrs(from
).legacy_equals(m
->cluster_addrs
));
3619 ceph_assert(osdmap
.get_uuid(from
) == m
->sb
.osd_fsid
);
3621 if (pending_inc
.new_state
.count(from
) == 0 ||
3622 (pending_inc
.new_state
[from
] & CEPH_OSD_UP
) == 0) {
3623 // mark previous guy down
3624 pending_inc
.new_state
[from
] = CEPH_OSD_UP
;
3626 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3627 } else if (pending_inc
.new_up_client
.count(from
)) {
3628 // already prepared, just wait
3629 dout(7) << __func__
<< " already prepared, waiting on "
3630 << m
->get_orig_source_addr() << dendl
;
3631 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
3634 pending_inc
.new_up_client
[from
] = m
->get_orig_source_addrs();
3635 pending_inc
.new_up_cluster
[from
] = m
->cluster_addrs
;
3636 pending_inc
.new_hb_back_up
[from
] = m
->hb_back_addrs
;
3637 pending_inc
.new_hb_front_up
[from
] = m
->hb_front_addrs
;
3639 down_pending_out
.erase(from
); // if any
3642 osd_weight
[from
] = m
->sb
.weight
;
3645 dout(10) << " setting osd." << from
<< " uuid to " << m
->sb
.osd_fsid
3647 if (!osdmap
.exists(from
) || osdmap
.get_uuid(from
) != m
->sb
.osd_fsid
) {
3648 // preprocess should have caught this; if not, assert.
3649 ceph_assert(!osdmap
.exists(from
) || osdmap
.get_uuid(from
).is_zero());
3650 pending_inc
.new_uuid
[from
] = m
->sb
.osd_fsid
;
3654 if (m
->sb
.newest_map
== 0 && osdmap
.exists(from
)) {
3655 const osd_info_t
& i
= osdmap
.get_info(from
);
3656 if (i
.up_from
> i
.lost_at
) {
3657 dout(10) << " fresh osd; marking lost_at too" << dendl
;
3658 pending_inc
.new_lost
[from
] = osdmap
.get_epoch();
3663 bufferlist osd_metadata
;
3664 encode(m
->metadata
, osd_metadata
);
3665 pending_metadata
[from
] = osd_metadata
;
3666 pending_metadata_rm
.erase(from
);
3668 // adjust last clean unmount epoch?
3669 const osd_info_t
& info
= osdmap
.get_info(from
);
3670 dout(10) << " old osd_info: " << info
<< dendl
;
3671 if (m
->sb
.mounted
> info
.last_clean_begin
||
3672 (m
->sb
.mounted
== info
.last_clean_begin
&&
3673 m
->sb
.clean_thru
> info
.last_clean_end
)) {
3674 epoch_t begin
= m
->sb
.mounted
;
3675 epoch_t end
= m
->sb
.clean_thru
;
3677 dout(10) << __func__
<< " osd." << from
<< " last_clean_interval "
3678 << "[" << info
.last_clean_begin
<< "," << info
.last_clean_end
3679 << ") -> [" << begin
<< "-" << end
<< ")"
3681 pending_inc
.new_last_clean_interval
[from
] =
3682 pair
<epoch_t
,epoch_t
>(begin
, end
);
3685 if (pending_inc
.new_xinfo
.count(from
) == 0)
3686 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
3687 osd_xinfo_t
& xi
= pending_inc
.new_xinfo
[from
];
3688 if (m
->boot_epoch
== 0) {
3689 xi
.laggy_probability
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3690 xi
.laggy_interval
*= (1.0 - g_conf()->mon_osd_laggy_weight
);
3691 dout(10) << " not laggy, new xi " << xi
<< dendl
;
3693 if (xi
.down_stamp
.sec()) {
3694 int interval
= ceph_clock_now().sec() -
3695 xi
.down_stamp
.sec();
3696 if (g_conf()->mon_osd_laggy_max_interval
&&
3697 (interval
> g_conf()->mon_osd_laggy_max_interval
)) {
3698 interval
= g_conf()->mon_osd_laggy_max_interval
;
3701 interval
* g_conf()->mon_osd_laggy_weight
+
3702 xi
.laggy_interval
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3704 xi
.laggy_probability
=
3705 g_conf()->mon_osd_laggy_weight
+
3706 xi
.laggy_probability
* (1.0 - g_conf()->mon_osd_laggy_weight
);
3707 dout(10) << " laggy, now xi " << xi
<< dendl
;
3710 // set features shared by the osd
3711 if (m
->osd_features
)
3712 xi
.features
= m
->osd_features
;
3714 xi
.features
= m
->get_connection()->get_features();
3717 if ((g_conf()->mon_osd_auto_mark_auto_out_in
&&
3718 (oldstate
& CEPH_OSD_AUTOOUT
)) ||
3719 (g_conf()->mon_osd_auto_mark_new_in
&& (oldstate
& CEPH_OSD_NEW
)) ||
3720 (g_conf()->mon_osd_auto_mark_in
)) {
3721 if (can_mark_in(from
)) {
3722 if (xi
.old_weight
> 0) {
3723 pending_inc
.new_weight
[from
] = xi
.old_weight
;
3726 pending_inc
.new_weight
[from
] = CEPH_OSD_IN
;
3729 dout(7) << __func__
<< " NOIN set, will not mark in "
3730 << m
->get_orig_source_addr() << dendl
;
3735 wait_for_finished_proposal(op
, new C_Booted(this, op
));
3740 void OSDMonitor::_booted(MonOpRequestRef op
, bool logit
)
3742 op
->mark_osdmon_event(__func__
);
3743 auto m
= op
->get_req
<MOSDBoot
>();
3744 dout(7) << "_booted " << m
->get_orig_source_inst()
3745 << " w " << m
->sb
.weight
<< " from " << m
->sb
.current_epoch
<< dendl
;
3748 mon
.clog
->info() << m
->get_source() << " " << m
->get_orig_source_addrs()
3752 send_latest(op
, m
->sb
.current_epoch
+1);
3759 bool OSDMonitor::preprocess_full(MonOpRequestRef op
)
3761 op
->mark_osdmon_event(__func__
);
3762 auto m
= op
->get_req
<MOSDFull
>();
3763 int from
= m
->get_orig_source().num();
3765 unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3767 // check permissions, ignore if failed
3768 MonSession
*session
= op
->get_session();
3771 if (!session
->is_capable("osd", MON_CAP_X
)) {
3772 dout(0) << "MOSDFull from entity with insufficient privileges:"
3773 << session
->caps
<< dendl
;
3777 // ignore a full message from the osd instance that already went down
3778 if (!osdmap
.exists(from
)) {
3779 dout(7) << __func__
<< " ignoring full message from nonexistent "
3780 << m
->get_orig_source_inst() << dendl
;
3783 if ((!osdmap
.is_up(from
) &&
3784 osdmap
.get_most_recent_addrs(from
).legacy_equals(
3785 m
->get_orig_source_addrs())) ||
3786 (osdmap
.is_up(from
) &&
3787 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs()))) {
3788 dout(7) << __func__
<< " ignoring full message from down "
3789 << m
->get_orig_source_inst() << dendl
;
3793 OSDMap::calc_state_set(osdmap
.get_state(from
), state
);
3795 if ((osdmap
.get_state(from
) & mask
) == m
->state
) {
3796 dout(7) << __func__
<< " state already " << state
<< " for osd." << from
3797 << " " << m
->get_orig_source_inst() << dendl
;
3798 _reply_map(op
, m
->version
);
3802 dout(10) << __func__
<< " want state " << state
<< " for osd." << from
3803 << " " << m
->get_orig_source_inst() << dendl
;
3810 bool OSDMonitor::prepare_full(MonOpRequestRef op
)
3812 op
->mark_osdmon_event(__func__
);
3813 auto m
= op
->get_req
<MOSDFull
>();
3814 const int from
= m
->get_orig_source().num();
3816 const unsigned mask
= CEPH_OSD_NEARFULL
| CEPH_OSD_BACKFILLFULL
| CEPH_OSD_FULL
;
3817 const unsigned want_state
= m
->state
& mask
; // safety first
3819 unsigned cur_state
= osdmap
.get_state(from
);
3820 auto p
= pending_inc
.new_state
.find(from
);
3821 if (p
!= pending_inc
.new_state
.end()) {
3822 cur_state
^= p
->second
;
3826 set
<string
> want_state_set
, cur_state_set
;
3827 OSDMap::calc_state_set(want_state
, want_state_set
);
3828 OSDMap::calc_state_set(cur_state
, cur_state_set
);
3830 if (cur_state
!= want_state
) {
3831 if (p
!= pending_inc
.new_state
.end()) {
3834 pending_inc
.new_state
[from
] = 0;
3836 pending_inc
.new_state
[from
] |= (osdmap
.get_state(from
) & mask
) ^ want_state
;
3837 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3838 << " -> " << want_state_set
<< dendl
;
3840 dout(7) << __func__
<< " osd." << from
<< " " << cur_state_set
3841 << " = wanted " << want_state_set
<< ", just waiting" << dendl
;
3844 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3851 bool OSDMonitor::preprocess_alive(MonOpRequestRef op
)
3853 op
->mark_osdmon_event(__func__
);
3854 auto m
= op
->get_req
<MOSDAlive
>();
3855 int from
= m
->get_orig_source().num();
3857 // check permissions, ignore if failed
3858 MonSession
*session
= op
->get_session();
3861 if (!session
->is_capable("osd", MON_CAP_X
)) {
3862 dout(0) << "attempt to send MOSDAlive from entity with insufficient privileges:"
3863 << session
->caps
<< dendl
;
3867 if (!osdmap
.is_up(from
) ||
3868 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
3869 dout(7) << "preprocess_alive ignoring alive message from down "
3870 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
3875 if (osdmap
.get_up_thru(from
) >= m
->want
) {
3877 dout(7) << "preprocess_alive want up_thru " << m
->want
<< " dup from " << m
->get_orig_source_inst() << dendl
;
3878 _reply_map(op
, m
->version
);
3882 dout(10) << "preprocess_alive want up_thru " << m
->want
3883 << " from " << m
->get_orig_source_inst() << dendl
;
3890 bool OSDMonitor::prepare_alive(MonOpRequestRef op
)
3892 op
->mark_osdmon_event(__func__
);
3893 auto m
= op
->get_req
<MOSDAlive
>();
3894 int from
= m
->get_orig_source().num();
3896 if (0) { // we probably don't care much about these
3897 mon
.clog
->debug() << m
->get_orig_source_inst() << " alive";
3900 dout(7) << "prepare_alive want up_thru " << m
->want
<< " have " << m
->version
3901 << " from " << m
->get_orig_source_inst() << dendl
;
3903 update_up_thru(from
, m
->version
); // set to the latest map the OSD has
3904 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
3908 void OSDMonitor::_reply_map(MonOpRequestRef op
, epoch_t e
)
3910 op
->mark_osdmon_event(__func__
);
3911 dout(7) << "_reply_map " << e
3912 << " from " << op
->get_req()->get_orig_source_inst()
3918 bool OSDMonitor::preprocess_pg_created(MonOpRequestRef op
)
3920 op
->mark_osdmon_event(__func__
);
3921 auto m
= op
->get_req
<MOSDPGCreated
>();
3922 dout(10) << __func__
<< " " << *m
<< dendl
;
3923 auto session
= op
->get_session();
3926 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3929 if (!session
->is_capable("osd", MON_CAP_X
)) {
3930 derr
<< __func__
<< " received from entity "
3931 << "with insufficient privileges " << session
->caps
<< dendl
;
3934 // always forward the "created!" to the leader
3938 bool OSDMonitor::prepare_pg_created(MonOpRequestRef op
)
3940 op
->mark_osdmon_event(__func__
);
3941 auto m
= op
->get_req
<MOSDPGCreated
>();
3942 dout(10) << __func__
<< " " << *m
<< dendl
;
3943 auto src
= m
->get_orig_source();
3944 auto from
= src
.num();
3945 if (!src
.is_osd() ||
3946 !mon
.osdmon()->osdmap
.is_up(from
) ||
3947 !mon
.osdmon()->osdmap
.get_addrs(from
).legacy_equals(
3948 m
->get_orig_source_addrs())) {
3949 dout(1) << __func__
<< " ignoring stats from non-active osd." << dendl
;
3952 pending_created_pgs
.push_back(m
->pgid
);
3956 bool OSDMonitor::preprocess_pg_ready_to_merge(MonOpRequestRef op
)
3958 op
->mark_osdmon_event(__func__
);
3959 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
3960 dout(10) << __func__
<< " " << *m
<< dendl
;
3961 const pg_pool_t
*pi
;
3962 auto session
= op
->get_session();
3964 dout(10) << __func__
<< ": no monitor session!" << dendl
;
3967 if (!session
->is_capable("osd", MON_CAP_X
)) {
3968 derr
<< __func__
<< " received from entity "
3969 << "with insufficient privileges " << session
->caps
<< dendl
;
3972 pi
= osdmap
.get_pg_pool(m
->pgid
.pool());
3974 derr
<< __func__
<< " pool for " << m
->pgid
<< " dne" << dendl
;
3977 if (pi
->get_pg_num() <= m
->pgid
.ps()) {
3978 dout(20) << " pg_num " << pi
->get_pg_num() << " already < " << m
->pgid
<< dendl
;
3981 if (pi
->get_pg_num() != m
->pgid
.ps() + 1) {
3982 derr
<< " OSD trying to merge wrong pgid " << m
->pgid
<< dendl
;
3985 if (pi
->get_pg_num_pending() > m
->pgid
.ps()) {
3986 dout(20) << " pg_num_pending " << pi
->get_pg_num_pending() << " > " << m
->pgid
<< dendl
;
3996 bool OSDMonitor::prepare_pg_ready_to_merge(MonOpRequestRef op
)
3998 op
->mark_osdmon_event(__func__
);
3999 auto m
= op
->get_req
<MOSDPGReadyToMerge
>();
4000 dout(10) << __func__
<< " " << *m
<< dendl
;
4002 if (pending_inc
.new_pools
.count(m
->pgid
.pool()))
4003 p
= pending_inc
.new_pools
[m
->pgid
.pool()];
4005 p
= *osdmap
.get_pg_pool(m
->pgid
.pool());
4006 if (p
.get_pg_num() != m
->pgid
.ps() + 1 ||
4007 p
.get_pg_num_pending() > m
->pgid
.ps()) {
4008 dout(10) << __func__
4009 << " race with concurrent pg_num[_pending] update, will retry"
4011 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
4016 p
.dec_pg_num(m
->pgid
,
4020 m
->last_epoch_started
,
4021 m
->last_epoch_clean
);
4022 p
.last_change
= pending_inc
.epoch
;
4024 // back off the merge attempt!
4025 p
.set_pg_num_pending(p
.get_pg_num());
4028 // force pre-nautilus clients to resend their ops, since they
4029 // don't understand pg_num_pending changes form a new interval
4030 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
4032 pending_inc
.new_pools
[m
->pgid
.pool()] = p
;
4034 auto prob
= g_conf().get_val
<double>("mon_inject_pg_merge_bounce_probability");
4037 prob
> (double)(rand() % 1000)/1000.0) {
4038 derr
<< __func__
<< " injecting pg merge pg_num bounce" << dendl
;
4039 auto n
= new MMonCommand(mon
.monmap
->get_fsid());
4040 n
->set_connection(m
->get_connection());
4041 n
->cmd
= { "{\"prefix\":\"osd pool set\", \"pool\": \"" +
4042 osdmap
.get_pool_name(m
->pgid
.pool()) +
4043 "\", \"var\": \"pg_num_actual\", \"val\": \"" +
4044 stringify(m
->pgid
.ps() + 1) + "\"}" };
4045 MonOpRequestRef nop
= mon
.op_tracker
.create_request
<MonOpRequest
>(n
);
4046 nop
->set_type_service();
4047 wait_for_finished_proposal(op
, new C_RetryMessage(this, nop
));
4049 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->version
));
4058 bool OSDMonitor::preprocess_pgtemp(MonOpRequestRef op
)
4060 auto m
= op
->get_req
<MOSDPGTemp
>();
4061 dout(10) << "preprocess_pgtemp " << *m
<< dendl
;
4062 mempool::osdmap::vector
<int> empty
;
4063 int from
= m
->get_orig_source().num();
4064 size_t ignore_cnt
= 0;
4067 MonSession
*session
= op
->get_session();
4070 if (!session
->is_capable("osd", MON_CAP_X
)) {
4071 dout(0) << "attempt to send MOSDPGTemp from entity with insufficient caps "
4072 << session
->caps
<< dendl
;
4076 if (!osdmap
.is_up(from
) ||
4077 !osdmap
.get_addrs(from
).legacy_equals(m
->get_orig_source_addrs())) {
4078 dout(7) << "ignoring pgtemp message from down "
4079 << m
->get_orig_source() << " " << m
->get_orig_source_addrs()
4088 for (auto p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4089 dout(20) << " " << p
->first
4090 << (osdmap
.pg_temp
->count(p
->first
) ? osdmap
.pg_temp
->get(p
->first
) : empty
)
4091 << " -> " << p
->second
<< dendl
;
4093 // does the pool exist?
4094 if (!osdmap
.have_pg_pool(p
->first
.pool())) {
4096 * 1. If the osdmap does not have the pool, it means the pool has been
4097 * removed in-between the osd sending this message and us handling it.
4098 * 2. If osdmap doesn't have the pool, it is safe to assume the pool does
4099 * not exist in the pending either, as the osds would not send a
4100 * message about a pool they know nothing about (yet).
4101 * 3. However, if the pool does exist in the pending, then it must be a
4102 * new pool, and not relevant to this message (see 1).
4104 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4105 << ": pool has been removed" << dendl
;
4110 int acting_primary
= -1;
4111 osdmap
.pg_to_up_acting_osds(
4112 p
->first
, nullptr, nullptr, nullptr, &acting_primary
);
4113 if (acting_primary
!= from
) {
4114 /* If the source isn't the primary based on the current osdmap, we know
4115 * that the interval changed and that we can discard this message.
4116 * Indeed, we must do so to avoid 16127 since we can't otherwise determine
4117 * which of two pg temp mappings on the same pg is more recent.
4119 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4120 << ": primary has changed" << dendl
;
4126 if (p
->second
.empty() && (osdmap
.pg_temp
->count(p
->first
) ||
4127 osdmap
.primary_temp
->count(p
->first
)))
4130 // NOTE: we assume that this will clear pg_primary, so consider
4131 // an existing pg_primary field to imply a change
4132 if (p
->second
.size() &&
4133 (osdmap
.pg_temp
->count(p
->first
) == 0 ||
4134 osdmap
.pg_temp
->get(p
->first
) != p
->second
||
4135 osdmap
.primary_temp
->count(p
->first
)))
4139 // should we ignore all the pgs?
4140 if (ignore_cnt
== m
->pg_temp
.size())
4143 dout(7) << "preprocess_pgtemp e" << m
->map_epoch
<< " no changes from " << m
->get_orig_source_inst() << dendl
;
4144 _reply_map(op
, m
->map_epoch
);
4152 void OSDMonitor::update_up_thru(int from
, epoch_t up_thru
)
4154 epoch_t old_up_thru
= osdmap
.get_up_thru(from
);
4155 auto ut
= pending_inc
.new_up_thru
.find(from
);
4156 if (ut
!= pending_inc
.new_up_thru
.end()) {
4157 old_up_thru
= ut
->second
;
4159 if (up_thru
> old_up_thru
) {
4160 // set up_thru too, so the osd doesn't have to ask again
4161 pending_inc
.new_up_thru
[from
] = up_thru
;
4165 bool OSDMonitor::prepare_pgtemp(MonOpRequestRef op
)
4167 op
->mark_osdmon_event(__func__
);
4168 auto m
= op
->get_req
<MOSDPGTemp
>();
4169 int from
= m
->get_orig_source().num();
4170 dout(7) << "prepare_pgtemp e" << m
->map_epoch
<< " from " << m
->get_orig_source_inst() << dendl
;
4171 for (map
<pg_t
,vector
<int32_t> >::iterator p
= m
->pg_temp
.begin(); p
!= m
->pg_temp
.end(); ++p
) {
4172 uint64_t pool
= p
->first
.pool();
4173 if (pending_inc
.old_pools
.count(pool
)) {
4174 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4175 << ": pool pending removal" << dendl
;
4178 if (!osdmap
.have_pg_pool(pool
)) {
4179 dout(10) << __func__
<< " ignore " << p
->first
<< " -> " << p
->second
4180 << ": pool has been removed" << dendl
;
4183 pending_inc
.new_pg_temp
[p
->first
] =
4184 mempool::osdmap::vector
<int>(p
->second
.begin(), p
->second
.end());
4186 // unconditionally clear pg_primary (until this message can encode
4187 // a change for that, too.. at which point we need to also fix
4188 // preprocess_pg_temp)
4189 if (osdmap
.primary_temp
->count(p
->first
) ||
4190 pending_inc
.new_primary_temp
.count(p
->first
))
4191 pending_inc
.new_primary_temp
[p
->first
] = -1;
4194 // set up_thru too, so the osd doesn't have to ask again
4195 update_up_thru(from
, m
->map_epoch
);
4197 wait_for_finished_proposal(op
, new C_ReplyMap(this, op
, m
->map_epoch
));
4204 bool OSDMonitor::preprocess_remove_snaps(MonOpRequestRef op
)
4206 op
->mark_osdmon_event(__func__
);
4207 auto m
= op
->get_req
<MRemoveSnaps
>();
4208 dout(7) << "preprocess_remove_snaps " << *m
<< dendl
;
4210 // check privilege, ignore if failed
4211 MonSession
*session
= op
->get_session();
4215 if (!session
->caps
.is_capable(
4217 session
->entity_name
,
4218 "osd", "osd pool rmsnap", {}, true, true, false,
4219 session
->get_peer_socket_addr())) {
4220 dout(0) << "got preprocess_remove_snaps from entity with insufficient caps "
4221 << session
->caps
<< dendl
;
4225 for (map
<int, vector
<snapid_t
> >::iterator q
= m
->snaps
.begin();
4226 q
!= m
->snaps
.end();
4228 if (!osdmap
.have_pg_pool(q
->first
)) {
4229 dout(10) << " ignoring removed_snaps " << q
->second
4230 << " on non-existent pool " << q
->first
<< dendl
;
4233 const pg_pool_t
*pi
= osdmap
.get_pg_pool(q
->first
);
4234 for (vector
<snapid_t
>::iterator p
= q
->second
.begin();
4235 p
!= q
->second
.end();
4237 if (*p
> pi
->get_snap_seq() ||
4238 !_is_removed_snap(q
->first
, *p
)) {
4244 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4245 auto reply
= make_message
<MRemoveSnaps
>();
4246 reply
->snaps
= m
->snaps
;
4247 mon
.send_reply(op
, reply
.detach());
4254 bool OSDMonitor::prepare_remove_snaps(MonOpRequestRef op
)
4256 op
->mark_osdmon_event(__func__
);
4257 auto m
= op
->get_req
<MRemoveSnaps
>();
4258 dout(7) << "prepare_remove_snaps " << *m
<< dendl
;
4260 for (auto& [pool
, snaps
] : m
->snaps
) {
4261 if (!osdmap
.have_pg_pool(pool
)) {
4262 dout(10) << " ignoring removed_snaps " << snaps
4263 << " on non-existent pool " << pool
<< dendl
;
4267 pg_pool_t
& pi
= osdmap
.pools
[pool
];
4268 for (auto s
: snaps
) {
4269 if (!_is_removed_snap(pool
, s
) &&
4270 (!pending_inc
.new_pools
.count(pool
) ||
4271 !pending_inc
.new_pools
[pool
].removed_snaps
.contains(s
)) &&
4272 (!pending_inc
.new_removed_snaps
.count(pool
) ||
4273 !pending_inc
.new_removed_snaps
[pool
].contains(s
))) {
4274 pg_pool_t
*newpi
= pending_inc
.get_new_pool(pool
, &pi
);
4275 if (osdmap
.require_osd_release
< ceph_release_t::octopus
) {
4276 newpi
->removed_snaps
.insert(s
);
4277 dout(10) << " pool " << pool
<< " removed_snaps added " << s
4278 << " (now " << newpi
->removed_snaps
<< ")" << dendl
;
4280 newpi
->flags
|= pg_pool_t::FLAG_SELFMANAGED_SNAPS
;
4281 if (s
> newpi
->get_snap_seq()) {
4282 dout(10) << " pool " << pool
<< " snap_seq "
4283 << newpi
->get_snap_seq() << " -> " << s
<< dendl
;
4284 newpi
->set_snap_seq(s
);
4286 newpi
->set_snap_epoch(pending_inc
.epoch
);
4287 dout(10) << " added pool " << pool
<< " snap " << s
4288 << " to removed_snaps queue" << dendl
;
4289 pending_inc
.new_removed_snaps
[pool
].insert(s
);
4294 if (HAVE_FEATURE(m
->get_connection()->get_features(), SERVER_OCTOPUS
)) {
4295 auto reply
= make_message
<MRemoveSnaps
>();
4296 reply
->snaps
= m
->snaps
;
4297 wait_for_finished_proposal(op
, new C_ReplyOp(this, op
, reply
));
4303 bool OSDMonitor::preprocess_get_purged_snaps(MonOpRequestRef op
)
4305 op
->mark_osdmon_event(__func__
);
4306 auto m
= op
->get_req
<MMonGetPurgedSnaps
>();
4307 dout(7) << __func__
<< " " << *m
<< dendl
;
4309 map
<epoch_t
,mempool::osdmap::map
<int64_t,snap_interval_set_t
>> r
;
4311 string k
= make_purged_snap_epoch_key(m
->start
);
4312 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
4314 unsigned long epoch
= m
->last
;
4315 while (it
->valid()) {
4316 if (it
->key().find("purged_epoch_") != 0) {
4319 string k
= it
->key();
4320 int n
= sscanf(k
.c_str(), "purged_epoch_%lx", &epoch
);
4322 derr
<< __func__
<< " unable to parse key '" << it
->key() << "'" << dendl
;
4323 } else if (epoch
> m
->last
) {
4326 bufferlist bl
= it
->value();
4327 auto p
= bl
.cbegin();
4331 } catch (ceph::buffer::error
& e
) {
4332 derr
<< __func__
<< " unable to parse value for key '" << it
->key()
4337 n
+= 4 + v
.size() * 16;
4340 // impose a semi-arbitrary limit to message size
4346 auto reply
= make_message
<MMonGetPurgedSnapsReply
>(m
->start
, epoch
);
4347 reply
->purged_snaps
.swap(r
);
4348 mon
.send_reply(op
, reply
.detach());
4354 bool OSDMonitor::preprocess_beacon(MonOpRequestRef op
)
4356 op
->mark_osdmon_event(__func__
);
4358 auto session
= op
->get_session();
4361 dout(10) << __func__
<< " no monitor session!" << dendl
;
4364 if (!session
->is_capable("osd", MON_CAP_X
)) {
4365 derr
<< __func__
<< " received from entity "
4366 << "with insufficient privileges " << session
->caps
<< dendl
;
4369 // Always forward the beacon to the leader, even if they are the same as
4370 // the old one. The leader will mark as down osds that haven't sent
4371 // beacon for a few minutes.
4375 bool OSDMonitor::prepare_beacon(MonOpRequestRef op
)
4377 op
->mark_osdmon_event(__func__
);
4378 const auto beacon
= op
->get_req
<MOSDBeacon
>();
4379 const auto src
= beacon
->get_orig_source();
4380 dout(10) << __func__
<< " " << *beacon
4381 << " from " << src
<< dendl
;
4382 int from
= src
.num();
4384 if (!src
.is_osd() ||
4385 !osdmap
.is_up(from
) ||
4386 !osdmap
.get_addrs(from
).legacy_equals(beacon
->get_orig_source_addrs())) {
4387 if (src
.is_osd() && !osdmap
.is_up(from
)) {
4388 // share some new maps with this guy in case it may not be
4389 // aware of its own deadness...
4390 send_latest(op
, beacon
->version
+1);
4392 dout(1) << " ignoring beacon from non-active osd." << from
<< dendl
;
4396 last_osd_report
[from
].first
= ceph_clock_now();
4397 last_osd_report
[from
].second
= beacon
->osd_beacon_report_interval
;
4398 osd_epochs
[from
] = beacon
->version
;
4400 for (const auto& pg
: beacon
->pgs
) {
4401 last_epoch_clean
.report(pg
, beacon
->min_last_epoch_clean
);
4404 if (osdmap
.osd_xinfo
[from
].last_purged_snaps_scrub
<
4405 beacon
->last_purged_snaps_scrub
) {
4406 if (pending_inc
.new_xinfo
.count(from
) == 0) {
4407 pending_inc
.new_xinfo
[from
] = osdmap
.osd_xinfo
[from
];
4409 pending_inc
.new_xinfo
[from
].last_purged_snaps_scrub
=
4410 beacon
->last_purged_snaps_scrub
;
4420 void OSDMonitor::send_latest(MonOpRequestRef op
, epoch_t start
)
4422 op
->mark_osdmon_event(__func__
);
4423 dout(5) << "send_latest to " << op
->get_req()->get_orig_source_inst()
4424 << " start " << start
<< dendl
;
4428 send_incremental(op
, start
);
4432 MOSDMap
*OSDMonitor::build_latest_full(uint64_t features
)
4434 MOSDMap
*r
= new MOSDMap(mon
.monmap
->fsid
, features
);
4435 get_version_full(osdmap
.get_epoch(), features
, r
->maps
[osdmap
.get_epoch()]);
4436 r
->oldest_map
= get_first_committed();
4437 r
->newest_map
= osdmap
.get_epoch();
4441 MOSDMap
*OSDMonitor::build_incremental(epoch_t from
, epoch_t to
, uint64_t features
)
4443 dout(10) << "build_incremental [" << from
<< ".." << to
<< "] with features "
4444 << std::hex
<< features
<< std::dec
<< dendl
;
4445 MOSDMap
*m
= new MOSDMap(mon
.monmap
->fsid
, features
);
4446 m
->oldest_map
= get_first_committed();
4447 m
->newest_map
= osdmap
.get_epoch();
4449 for (epoch_t e
= to
; e
>= from
&& e
> 0; e
--) {
4451 int err
= get_version(e
, features
, bl
);
4453 ceph_assert(bl
.length());
4454 // if (get_version(e, bl) > 0) {
4455 dout(20) << "build_incremental inc " << e
<< " "
4456 << bl
.length() << " bytes" << dendl
;
4457 m
->incremental_maps
[e
] = bl
;
4459 ceph_assert(err
== -ENOENT
);
4460 ceph_assert(!bl
.length());
4461 get_version_full(e
, features
, bl
);
4462 if (bl
.length() > 0) {
4463 //else if (get_version("full", e, bl) > 0) {
4464 dout(20) << "build_incremental full " << e
<< " "
4465 << bl
.length() << " bytes" << dendl
;
4468 ceph_abort(); // we should have all maps.
4475 void OSDMonitor::send_full(MonOpRequestRef op
)
4477 op
->mark_osdmon_event(__func__
);
4478 dout(5) << "send_full to " << op
->get_req()->get_orig_source_inst() << dendl
;
4479 mon
.send_reply(op
, build_latest_full(op
->get_session()->con_features
));
4482 void OSDMonitor::send_incremental(MonOpRequestRef op
, epoch_t first
)
4484 op
->mark_osdmon_event(__func__
);
4486 MonSession
*s
= op
->get_session();
4490 // oh, we can tell the other mon to do it
4491 dout(10) << __func__
<< " asking proxying mon to send_incremental from "
4493 MRoute
*r
= new MRoute(s
->proxy_tid
, NULL
);
4494 r
->send_osdmap_first
= first
;
4495 s
->proxy_con
->send_message(r
);
4496 op
->mark_event("reply: send routed send_osdmap_first reply");
4499 send_incremental(first
, s
, false, op
);
4503 void OSDMonitor::send_incremental(epoch_t first
,
4504 MonSession
*session
,
4506 MonOpRequestRef req
)
4508 dout(5) << "send_incremental [" << first
<< ".." << osdmap
.get_epoch() << "]"
4509 << " to " << session
->name
<< dendl
;
4511 // get feature of the peer
4512 // use quorum_con_features, if it's an anonymous connection.
4513 uint64_t features
= session
->con_features
? session
->con_features
:
4514 mon
.get_quorum_con_features();
4516 if (first
<= session
->osd_epoch
) {
4517 dout(10) << __func__
<< " " << session
->name
<< " should already have epoch "
4518 << session
->osd_epoch
<< dendl
;
4519 first
= session
->osd_epoch
+ 1;
4522 if (first
< get_first_committed()) {
4523 MOSDMap
*m
= new MOSDMap(osdmap
.get_fsid(), features
);
4524 m
->oldest_map
= get_first_committed();
4525 m
->newest_map
= osdmap
.get_epoch();
4527 first
= get_first_committed();
4529 int err
= get_version_full(first
, features
, bl
);
4530 ceph_assert(err
== 0);
4531 ceph_assert(bl
.length());
4532 dout(20) << "send_incremental starting with base full "
4533 << first
<< " " << bl
.length() << " bytes" << dendl
;
4534 m
->maps
[first
] = bl
;
4537 mon
.send_reply(req
, m
);
4538 session
->osd_epoch
= first
;
4541 session
->con
->send_message(m
);
4542 session
->osd_epoch
= first
;
4547 while (first
<= osdmap
.get_epoch()) {
4548 epoch_t last
= std::min
<epoch_t
>(first
+ g_conf()->osd_map_message_max
- 1,
4549 osdmap
.get_epoch());
4550 MOSDMap
*m
= build_incremental(first
, last
, features
);
4553 // send some maps. it may not be all of them, but it will get them
4555 mon
.send_reply(req
, m
);
4557 session
->con
->send_message(m
);
4560 session
->osd_epoch
= last
;
4566 int OSDMonitor::get_version(version_t ver
, bufferlist
& bl
)
4568 return get_version(ver
, mon
.get_quorum_con_features(), bl
);
4571 void OSDMonitor::reencode_incremental_map(bufferlist
& bl
, uint64_t features
)
4573 OSDMap::Incremental inc
;
4574 auto q
= bl
.cbegin();
4576 // always encode with subset of osdmap's canonical features
4577 uint64_t f
= features
& inc
.encode_features
;
4578 dout(20) << __func__
<< " " << inc
.epoch
<< " with features " << f
4581 if (inc
.fullmap
.length()) {
4582 // embedded full map?
4584 m
.decode(inc
.fullmap
);
4585 inc
.fullmap
.clear();
4586 m
.encode(inc
.fullmap
, f
| CEPH_FEATURE_RESERVED
);
4588 if (inc
.crush
.length()) {
4589 // embedded crush map
4591 auto p
= inc
.crush
.cbegin();
4594 c
.encode(inc
.crush
, f
);
4596 inc
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4599 void OSDMonitor::reencode_full_map(bufferlist
& bl
, uint64_t features
)
4602 auto q
= bl
.cbegin();
4604 // always encode with subset of osdmap's canonical features
4605 uint64_t f
= features
& m
.get_encoding_features();
4606 dout(20) << __func__
<< " " << m
.get_epoch() << " with features " << f
4609 m
.encode(bl
, f
| CEPH_FEATURE_RESERVED
);
4612 int OSDMonitor::get_version(version_t ver
, uint64_t features
, bufferlist
& bl
)
4614 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4615 if (inc_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4618 int ret
= PaxosService::get_version(ver
, bl
);
4622 // NOTE: this check is imprecise; the OSDMap encoding features may
4623 // be a subset of the latest mon quorum features, but worst case we
4624 // reencode once and then cache the (identical) result under both
4626 if (significant_features
!=
4627 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4628 reencode_incremental_map(bl
, features
);
4630 inc_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4634 int OSDMonitor::get_inc(version_t ver
, OSDMap::Incremental
& inc
)
4637 int err
= get_version(ver
, inc_bl
);
4638 ceph_assert(err
== 0);
4639 ceph_assert(inc_bl
.length());
4641 auto p
= inc_bl
.cbegin();
4643 dout(10) << __func__
<< " "
4644 << " epoch " << inc
.epoch
4645 << " inc_crc " << inc
.inc_crc
4646 << " full_crc " << inc
.full_crc
4647 << " encode_features " << inc
.encode_features
<< dendl
;
4651 int OSDMonitor::get_full_from_pinned_map(version_t ver
, bufferlist
& bl
)
4653 dout(10) << __func__
<< " ver " << ver
<< dendl
;
4655 version_t closest_pinned
= osdmap_manifest
.get_lower_closest_pinned(ver
);
4656 if (closest_pinned
== 0) {
4659 if (closest_pinned
> ver
) {
4660 dout(0) << __func__
<< " pinned: " << osdmap_manifest
.pinned
<< dendl
;
4662 ceph_assert(closest_pinned
<= ver
);
4664 dout(10) << __func__
<< " closest pinned ver " << closest_pinned
<< dendl
;
4666 // get osdmap incremental maps and apply on top of this one.
4668 bool has_cached_osdmap
= false;
4669 for (version_t v
= ver
-1; v
>= closest_pinned
; --v
) {
4670 if (full_osd_cache
.lookup({v
, mon
.get_quorum_con_features()},
4672 dout(10) << __func__
<< " found map in cache ver " << v
<< dendl
;
4674 has_cached_osdmap
= true;
4679 if (!has_cached_osdmap
) {
4680 int err
= PaxosService::get_version_full(closest_pinned
, osdm_bl
);
4682 derr
<< __func__
<< " closest pinned map ver " << closest_pinned
4683 << " not available! error: " << cpp_strerror(err
) << dendl
;
4685 ceph_assert(err
== 0);
4688 ceph_assert(osdm_bl
.length());
4691 osdm
.decode(osdm_bl
);
4693 dout(10) << __func__
<< " loaded osdmap epoch " << closest_pinned
4694 << " e" << osdm
.epoch
4695 << " crc " << osdm
.get_crc()
4696 << " -- applying incremental maps." << dendl
;
4698 uint64_t encode_features
= 0;
4699 for (version_t v
= closest_pinned
+ 1; v
<= ver
; ++v
) {
4700 dout(20) << __func__
<< " applying inc epoch " << v
<< dendl
;
4702 OSDMap::Incremental inc
;
4703 int err
= get_inc(v
, inc
);
4704 ceph_assert(err
== 0);
4706 encode_features
= inc
.encode_features
;
4708 err
= osdm
.apply_incremental(inc
);
4709 ceph_assert(err
== 0);
4711 // this block performs paranoid checks on map retrieval
4712 if (g_conf().get_val
<bool>("mon_debug_extra_checks") &&
4713 inc
.full_crc
!= 0) {
4715 uint64_t f
= encode_features
;
4717 f
= (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4720 // encode osdmap to force calculating crcs
4722 osdm
.encode(tbl
, f
| CEPH_FEATURE_RESERVED
);
4723 // decode osdmap to compare crcs with what's expected by incremental
4727 if (tosdm
.get_crc() != inc
.full_crc
) {
4729 << " osdmap crc mismatch! (osdmap crc " << tosdm
.get_crc()
4730 << ", expected " << inc
.full_crc
<< ")" << dendl
;
4731 ceph_abort_msg("osdmap crc mismatch");
4735 // note: we cannot add the recently computed map to the cache, as is,
4736 // because we have not encoded the map into a bl.
4739 if (!encode_features
) {
4740 dout(10) << __func__
4741 << " last incremental map didn't have features;"
4742 << " defaulting to quorum's or all" << dendl
;
4744 (mon
.quorum_con_features
? mon
.quorum_con_features
: -1);
4746 osdm
.encode(bl
, encode_features
| CEPH_FEATURE_RESERVED
);
4751 int OSDMonitor::get_version_full(version_t ver
, bufferlist
& bl
)
4753 return get_version_full(ver
, mon
.get_quorum_con_features(), bl
);
4756 int OSDMonitor::get_version_full(version_t ver
, uint64_t features
,
4759 uint64_t significant_features
= OSDMap::get_significant_features(features
);
4760 if (full_osd_cache
.lookup({ver
, significant_features
}, &bl
)) {
4763 int ret
= PaxosService::get_version_full(ver
, bl
);
4764 if (ret
== -ENOENT
) {
4766 ret
= get_full_from_pinned_map(ver
, bl
);
4771 // NOTE: this check is imprecise; the OSDMap encoding features may
4772 // be a subset of the latest mon quorum features, but worst case we
4773 // reencode once and then cache the (identical) result under both
4775 if (significant_features
!=
4776 OSDMap::get_significant_features(mon
.get_quorum_con_features())) {
4777 reencode_full_map(bl
, features
);
4779 full_osd_cache
.add_bytes({ver
, significant_features
}, bl
);
4783 epoch_t
OSDMonitor::blocklist(const entity_addrvec_t
& av
, utime_t until
)
4785 dout(10) << "blocklist " << av
<< " until " << until
<< dendl
;
4786 for (auto a
: av
.v
) {
4787 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4788 a
.set_type(entity_addr_t::TYPE_ANY
);
4790 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4792 pending_inc
.new_blocklist
[a
] = until
;
4794 return pending_inc
.epoch
;
4797 epoch_t
OSDMonitor::blocklist(entity_addr_t a
, utime_t until
)
4799 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
4800 a
.set_type(entity_addr_t::TYPE_ANY
);
4802 a
.set_type(entity_addr_t::TYPE_LEGACY
);
4804 dout(10) << "blocklist " << a
<< " until " << until
<< dendl
;
4805 pending_inc
.new_blocklist
[a
] = until
;
4806 return pending_inc
.epoch
;
4810 void OSDMonitor::check_osdmap_subs()
4812 dout(10) << __func__
<< dendl
;
4813 if (!osdmap
.get_epoch()) {
4816 auto osdmap_subs
= mon
.session_map
.subs
.find("osdmap");
4817 if (osdmap_subs
== mon
.session_map
.subs
.end()) {
4820 auto p
= osdmap_subs
->second
->begin();
4824 check_osdmap_sub(sub
);
4828 void OSDMonitor::check_osdmap_sub(Subscription
*sub
)
4830 dout(10) << __func__
<< " " << sub
<< " next " << sub
->next
4831 << (sub
->onetime
? " (onetime)":" (ongoing)") << dendl
;
4832 if (sub
->next
<= osdmap
.get_epoch()) {
4834 send_incremental(sub
->next
, sub
->session
, sub
->incremental_onetime
);
4836 sub
->session
->con
->send_message(build_latest_full(sub
->session
->con_features
));
4838 mon
.session_map
.remove_sub(sub
);
4840 sub
->next
= osdmap
.get_epoch() + 1;
4844 void OSDMonitor::check_pg_creates_subs()
4846 if (!osdmap
.get_num_up_osds()) {
4849 ceph_assert(osdmap
.get_up_osd_features() & CEPH_FEATURE_MON_STATEFUL_SUB
);
4850 mon
.with_session_map([this](const MonSessionMap
& session_map
) {
4851 auto pg_creates_subs
= session_map
.subs
.find("osd_pg_creates");
4852 if (pg_creates_subs
== session_map
.subs
.end()) {
4855 for (auto sub
: *pg_creates_subs
->second
) {
4856 check_pg_creates_sub(sub
);
4861 void OSDMonitor::check_pg_creates_sub(Subscription
*sub
)
4863 dout(20) << __func__
<< " .. " << sub
->session
->name
<< dendl
;
4864 ceph_assert(sub
->type
== "osd_pg_creates");
4865 // only send these if the OSD is up. we will check_subs() when they do
4866 // come up so they will get the creates then.
4867 if (sub
->session
->name
.is_osd() &&
4868 mon
.osdmon()->osdmap
.is_up(sub
->session
->name
.num())) {
4869 sub
->next
= send_pg_creates(sub
->session
->name
.num(),
4870 sub
->session
->con
.get(),
4875 void OSDMonitor::do_application_enable(int64_t pool_id
,
4876 const std::string
&app_name
,
4877 const std::string
&app_key
,
4878 const std::string
&app_value
,
4881 ceph_assert(paxos
.is_plugged() && is_writeable());
4883 dout(20) << __func__
<< ": pool_id=" << pool_id
<< ", app_name=" << app_name
4886 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
4888 auto pp
= osdmap
.get_pg_pool(pool_id
);
4889 ceph_assert(pp
!= nullptr);
4892 if (pending_inc
.new_pools
.count(pool_id
)) {
4893 p
= pending_inc
.new_pools
[pool_id
];
4896 if (app_key
.empty()) {
4897 p
.application_metadata
.insert({app_name
, {}});
4900 p
.application_metadata
[app_name
][app_key
] = app_value
;
4902 p
.application_metadata
.insert({app_name
, {{app_key
, app_value
}}});
4905 p
.last_change
= pending_inc
.epoch
;
4906 pending_inc
.new_pools
[pool_id
] = p
;
4909 void OSDMonitor::do_set_pool_opt(int64_t pool_id
,
4910 pool_opts_t::key_t opt
,
4911 pool_opts_t::value_t val
)
4913 auto p
= pending_inc
.new_pools
.try_emplace(
4914 pool_id
, *osdmap
.get_pg_pool(pool_id
));
4915 p
.first
->second
.opts
.set(opt
, val
);
4918 unsigned OSDMonitor::scan_for_creating_pgs(
4919 const mempool::osdmap::map
<int64_t,pg_pool_t
>& pools
,
4920 const mempool::osdmap::set
<int64_t>& removed_pools
,
4922 creating_pgs_t
* creating_pgs
) const
4924 unsigned queued
= 0;
4925 for (auto& p
: pools
) {
4926 int64_t poolid
= p
.first
;
4927 if (creating_pgs
->created_pools
.count(poolid
)) {
4928 dout(10) << __func__
<< " already created " << poolid
<< dendl
;
4931 const pg_pool_t
& pool
= p
.second
;
4932 int ruleno
= osdmap
.crush
->find_rule(pool
.get_crush_rule(),
4933 pool
.get_type(), pool
.get_size());
4934 if (ruleno
< 0 || !osdmap
.crush
->rule_exists(ruleno
))
4937 const auto last_scan_epoch
= creating_pgs
->last_scan_epoch
;
4938 const auto created
= pool
.get_last_change();
4939 if (last_scan_epoch
&& created
<= last_scan_epoch
) {
4940 dout(10) << __func__
<< " no change in pool " << poolid
4941 << " " << pool
<< dendl
;
4944 if (removed_pools
.count(poolid
)) {
4945 dout(10) << __func__
<< " pool is being removed: " << poolid
4946 << " " << pool
<< dendl
;
4949 dout(10) << __func__
<< " queueing pool create for " << poolid
4950 << " " << pool
<< dendl
;
4951 creating_pgs
->create_pool(poolid
, pool
.get_pg_num(),
4958 void OSDMonitor::update_creating_pgs()
4960 dout(10) << __func__
<< " " << creating_pgs
.pgs
.size() << " pgs creating, "
4961 << creating_pgs
.queue
.size() << " pools in queue" << dendl
;
4962 decltype(creating_pgs_by_osd_epoch
) new_pgs_by_osd_epoch
;
4963 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
4964 for (const auto& pg
: creating_pgs
.pgs
) {
4965 int acting_primary
= -1;
4966 auto pgid
= pg
.first
;
4967 if (!osdmap
.pg_exists(pgid
)) {
4968 dout(20) << __func__
<< " ignoring " << pgid
<< " which should not exist"
4972 auto mapped
= pg
.second
.create_epoch
;
4973 dout(20) << __func__
<< " looking up " << pgid
<< "@" << mapped
<< dendl
;
4975 mapping
.get_primary_and_shard(pgid
, &acting_primary
, &spgid
);
4976 // check the previous creating_pgs, look for the target to whom the pg was
4977 // previously mapped
4978 for (const auto& pgs_by_epoch
: creating_pgs_by_osd_epoch
) {
4979 const auto last_acting_primary
= pgs_by_epoch
.first
;
4980 for (auto& pgs
: pgs_by_epoch
.second
) {
4981 if (pgs
.second
.count(spgid
)) {
4982 if (last_acting_primary
== acting_primary
) {
4985 dout(20) << __func__
<< " " << pgid
<< " "
4986 << " acting_primary:" << last_acting_primary
4987 << " -> " << acting_primary
<< dendl
;
4988 // note epoch if the target of the create message changed.
4989 mapped
= mapping
.get_epoch();
4994 mapped
= mapping
.get_epoch();
4998 dout(10) << __func__
<< " will instruct osd." << acting_primary
4999 << " to create " << pgid
<< "@" << mapped
<< dendl
;
5000 new_pgs_by_osd_epoch
[acting_primary
][mapped
].insert(spgid
);
5002 creating_pgs_by_osd_epoch
= std::move(new_pgs_by_osd_epoch
);
5003 creating_pgs_epoch
= mapping
.get_epoch();
5006 epoch_t
OSDMonitor::send_pg_creates(int osd
, Connection
*con
, epoch_t next
) const
5008 dout(30) << __func__
<< " osd." << osd
<< " next=" << next
5009 << " " << creating_pgs_by_osd_epoch
<< dendl
;
5010 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
5011 if (creating_pgs_epoch
<= creating_pgs
.last_scan_epoch
) {
5012 dout(20) << __func__
5013 << " not using stale creating_pgs@" << creating_pgs_epoch
<< dendl
;
5014 // the subscribers will be updated when the mapping is completed anyway
5017 auto creating_pgs_by_epoch
= creating_pgs_by_osd_epoch
.find(osd
);
5018 if (creating_pgs_by_epoch
== creating_pgs_by_osd_epoch
.end())
5020 ceph_assert(!creating_pgs_by_epoch
->second
.empty());
5022 MOSDPGCreate
*oldm
= nullptr; // for pre-mimic OSD compat
5023 MOSDPGCreate2
*m
= nullptr;
5025 bool old
= osdmap
.require_osd_release
< ceph_release_t::nautilus
;
5028 for (auto epoch_pgs
= creating_pgs_by_epoch
->second
.lower_bound(next
);
5029 epoch_pgs
!= creating_pgs_by_epoch
->second
.end(); ++epoch_pgs
) {
5030 auto epoch
= epoch_pgs
->first
;
5031 auto& pgs
= epoch_pgs
->second
;
5032 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5033 << " : epoch " << epoch
<< " " << pgs
.size() << " pgs" << dendl
;
5035 for (auto& pg
: pgs
) {
5036 // Need the create time from the monitor using its clock to set
5037 // last_scrub_stamp upon pg creation.
5038 auto create
= creating_pgs
.pgs
.find(pg
.pgid
);
5039 ceph_assert(create
!= creating_pgs
.pgs
.end());
5042 oldm
= new MOSDPGCreate(creating_pgs_epoch
);
5044 oldm
->mkpg
.emplace(pg
.pgid
,
5045 pg_create_t
{create
->second
.create_epoch
, pg
.pgid
, 0});
5046 oldm
->ctimes
.emplace(pg
.pgid
, create
->second
.create_stamp
);
5049 m
= new MOSDPGCreate2(creating_pgs_epoch
);
5051 m
->pgs
.emplace(pg
, make_pair(create
->second
.create_epoch
,
5052 create
->second
.create_stamp
));
5053 if (create
->second
.history
.epoch_created
) {
5054 dout(20) << __func__
<< " " << pg
<< " " << create
->second
.history
5055 << " " << create
->second
.past_intervals
<< dendl
;
5056 m
->pg_extra
.emplace(pg
, make_pair(create
->second
.history
,
5057 create
->second
.past_intervals
));
5060 dout(20) << __func__
<< " will create " << pg
5061 << " at " << create
->second
.create_epoch
<< dendl
;
5065 con
->send_message(m
);
5067 con
->send_message(oldm
);
5069 dout(20) << __func__
<< " osd." << osd
<< " from " << next
5070 << " has nothing to send" << dendl
;
5074 // sub is current through last + 1
5081 void OSDMonitor::tick()
5083 if (!is_active()) return;
5085 dout(10) << osdmap
<< dendl
;
5087 // always update osdmap manifest, regardless of being the leader.
5088 load_osdmap_manifest();
5090 // always tune priority cache manager memory on leader and peons
5091 if (ceph_using_tcmalloc() && mon_memory_autotune
) {
5092 std::lock_guard
l(balancer_lock
);
5093 if (pcm
!= nullptr) {
5096 _set_new_cache_sizes();
5097 dout(10) << "tick balancer "
5098 << " inc cache_bytes: " << inc_cache
->get_cache_bytes()
5099 << " inc comtd_bytes: " << inc_cache
->get_committed_size()
5100 << " inc used_bytes: " << inc_cache
->_get_used_bytes()
5101 << " inc num_osdmaps: " << inc_cache
->_get_num_osdmaps()
5103 dout(10) << "tick balancer "
5104 << " full cache_bytes: " << full_cache
->get_cache_bytes()
5105 << " full comtd_bytes: " << full_cache
->get_committed_size()
5106 << " full used_bytes: " << full_cache
->_get_used_bytes()
5107 << " full num_osdmaps: " << full_cache
->_get_num_osdmaps()
5112 if (!mon
.is_leader()) return;
5114 bool do_propose
= false;
5115 utime_t now
= ceph_clock_now();
5117 if (handle_osd_timeouts(now
, last_osd_report
)) {
5122 if (check_failures(now
)) {
5126 // Force a proposal if we need to prune; pruning is performed on
5127 // ``encode_pending()``, hence why we need to regularly trigger a proposal
5128 // even if there's nothing going on.
5129 if (is_prune_enabled() && should_prune()) {
5133 // mark down osds out?
5135 /* can_mark_out() checks if we can mark osds as being out. The -1 has no
5136 * influence at all. The decision is made based on the ratio of "in" osds,
5137 * and the function returns false if this ratio is lower that the minimum
5138 * ratio set by g_conf()->mon_osd_min_in_ratio. So it's not really up to us.
5140 if (can_mark_out(-1)) {
5141 string down_out_subtree_limit
= g_conf().get_val
<string
>(
5142 "mon_osd_down_out_subtree_limit");
5143 set
<int> down_cache
; // quick cache of down subtrees
5145 map
<int,utime_t
>::iterator i
= down_pending_out
.begin();
5146 while (i
!= down_pending_out
.end()) {
5152 if (osdmap
.is_down(o
) &&
5155 utime_t
orig_grace(g_conf()->mon_osd_down_out_interval
, 0);
5156 utime_t grace
= orig_grace
;
5157 double my_grace
= 0.0;
5159 if (g_conf()->mon_osd_adjust_down_out_interval
) {
5160 // scale grace period the same way we do the heartbeat grace.
5161 const osd_xinfo_t
& xi
= osdmap
.get_xinfo(o
);
5162 double halflife
= (double)g_conf()->mon_osd_laggy_halflife
;
5163 double decay_k
= ::log(.5) / halflife
;
5164 double decay
= exp((double)down
* decay_k
);
5165 dout(20) << "osd." << o
<< " laggy halflife " << halflife
<< " decay_k " << decay_k
5166 << " down for " << down
<< " decay " << decay
<< dendl
;
5167 my_grace
= decay
* (double)xi
.laggy_interval
* xi
.laggy_probability
;
5171 // is this an entire large subtree down?
5172 if (down_out_subtree_limit
.length()) {
5173 int type
= osdmap
.crush
->get_type_id(down_out_subtree_limit
);
5175 if (osdmap
.containing_subtree_is_down(cct
, o
, type
, &down_cache
)) {
5176 dout(10) << "tick entire containing " << down_out_subtree_limit
5177 << " subtree for osd." << o
5178 << " is down; resetting timer" << dendl
;
5179 // reset timer, too.
5180 down_pending_out
[o
] = now
;
5186 bool down_out
= !osdmap
.is_destroyed(o
) &&
5187 g_conf()->mon_osd_down_out_interval
> 0 && down
.sec() >= grace
;
5188 bool destroyed_out
= osdmap
.is_destroyed(o
) &&
5189 g_conf()->mon_osd_destroyed_out_interval
> 0 &&
5190 // this is not precise enough as we did not make a note when this osd
5191 // was marked as destroyed, but let's not bother with that
5192 // complexity for now.
5193 down
.sec() >= g_conf()->mon_osd_destroyed_out_interval
;
5194 if (down_out
|| destroyed_out
) {
5195 dout(10) << "tick marking osd." << o
<< " OUT after " << down
5196 << " sec (target " << grace
<< " = " << orig_grace
<< " + " << my_grace
<< ")" << dendl
;
5197 pending_inc
.new_weight
[o
] = CEPH_OSD_OUT
;
5199 // set the AUTOOUT bit.
5200 if (pending_inc
.new_state
.count(o
) == 0)
5201 pending_inc
.new_state
[o
] = 0;
5202 pending_inc
.new_state
[o
] |= CEPH_OSD_AUTOOUT
;
5204 // remember previous weight
5205 if (pending_inc
.new_xinfo
.count(o
) == 0)
5206 pending_inc
.new_xinfo
[o
] = osdmap
.osd_xinfo
[o
];
5207 pending_inc
.new_xinfo
[o
].old_weight
= osdmap
.osd_weight
[o
];
5211 mon
.clog
->info() << "Marking osd." << o
<< " out (has been down for "
5212 << int(down
.sec()) << " seconds)";
5217 down_pending_out
.erase(o
);
5220 dout(10) << "tick NOOUT flag set, not checking down osds" << dendl
;
5223 // expire blocklisted items?
5224 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5225 p
!= osdmap
.blocklist
.end();
5227 if (p
->second
< now
) {
5228 dout(10) << "expiring blocklist item " << p
->first
<< " expired " << p
->second
<< " < now " << now
<< dendl
;
5229 pending_inc
.old_blocklist
.push_back(p
->first
);
5234 if (try_prune_purged_snaps()) {
5238 if (update_pools_status())
5242 !pending_inc
.new_pg_temp
.empty()) // also propose if we adjusted pg_temp
5246 void OSDMonitor::_set_new_cache_sizes()
5248 uint64_t cache_size
= 0;
5249 int64_t inc_alloc
= 0;
5250 int64_t full_alloc
= 0;
5251 int64_t kv_alloc
= 0;
5253 if (pcm
!= nullptr && rocksdb_binned_kv_cache
!= nullptr) {
5254 cache_size
= pcm
->get_tuned_mem();
5255 inc_alloc
= inc_cache
->get_committed_size();
5256 full_alloc
= full_cache
->get_committed_size();
5257 kv_alloc
= rocksdb_binned_kv_cache
->get_committed_size();
5260 inc_osd_cache
.set_bytes(inc_alloc
);
5261 full_osd_cache
.set_bytes(full_alloc
);
5263 dout(1) << __func__
<< " cache_size:" << cache_size
5264 << " inc_alloc: " << inc_alloc
5265 << " full_alloc: " << full_alloc
5266 << " kv_alloc: " << kv_alloc
5270 bool OSDMonitor::handle_osd_timeouts(const utime_t
&now
,
5271 std::map
<int, std::pair
<utime_t
, int>> &last_osd_report
)
5273 utime_t
timeo(g_conf()->mon_osd_report_timeout
, 0);
5274 if (now
- mon
.get_leader_since() < timeo
) {
5275 // We haven't been the leader for long enough to consider OSD timeouts
5279 int max_osd
= osdmap
.get_max_osd();
5280 bool new_down
= false;
5282 for (int i
=0; i
< max_osd
; ++i
) {
5283 dout(30) << __func__
<< ": checking up on osd " << i
<< dendl
;
5284 if (!osdmap
.exists(i
)) {
5285 last_osd_report
.erase(i
); // if any
5288 if (!osdmap
.is_up(i
))
5290 const std::map
<int, std::pair
<utime_t
, int>>::const_iterator t
= last_osd_report
.find(i
);
5291 if (t
== last_osd_report
.end()) {
5292 // it wasn't in the map; start the timer.
5293 last_osd_report
[i
].first
= now
;
5294 last_osd_report
[i
].second
= 0;
5295 } else if (can_mark_down(i
)) {
5296 utime_t diff
= now
- t
->second
.first
;
5297 // we use the max(mon_osd_report_timeout, 2*osd_beacon_report_interval) as timeout
5298 // to allow for the osd to miss a beacon.
5299 int mon_osd_report_timeout
= g_conf()->mon_osd_report_timeout
;
5300 utime_t
max_timeout(std::max(mon_osd_report_timeout
, 2 * t
->second
.second
), 0);
5301 if (diff
> max_timeout
) {
5302 mon
.clog
->info() << "osd." << i
<< " marked down after no beacon for "
5303 << diff
<< " seconds";
5304 derr
<< "no beacon from osd." << i
<< " since " << t
->second
.first
5305 << ", " << diff
<< " seconds ago. marking down" << dendl
;
5306 pending_inc
.new_state
[i
] = CEPH_OSD_UP
;
5314 static void dump_cpu_list(Formatter
*f
, const char *name
,
5315 const string
& strlist
)
5318 size_t cpu_set_size
;
5319 if (parse_cpu_set_list(strlist
.c_str(), &cpu_set_size
, &cpu_set
) < 0) {
5322 set
<int> cpus
= cpu_set_to_set(cpu_set_size
, &cpu_set
);
5323 f
->open_array_section(name
);
5324 for (auto cpu
: cpus
) {
5325 f
->dump_int("cpu", cpu
);
5330 void OSDMonitor::dump_info(Formatter
*f
)
5332 f
->open_object_section("osdmap");
5336 f
->open_array_section("osd_metadata");
5337 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5338 if (osdmap
.exists(i
)) {
5339 f
->open_object_section("osd");
5340 f
->dump_unsigned("id", i
);
5341 dump_osd_metadata(i
, f
, NULL
);
5347 f
->open_object_section("osdmap_clean_epochs");
5348 f
->dump_unsigned("min_last_epoch_clean", get_min_last_epoch_clean());
5350 f
->open_object_section("last_epoch_clean");
5351 last_epoch_clean
.dump(f
);
5354 f
->open_array_section("osd_epochs");
5355 for (auto& osd_epoch
: osd_epochs
) {
5356 f
->open_object_section("osd");
5357 f
->dump_unsigned("id", osd_epoch
.first
);
5358 f
->dump_unsigned("epoch", osd_epoch
.second
);
5361 f
->close_section(); // osd_epochs
5363 f
->close_section(); // osd_clean_epochs
5365 f
->dump_unsigned("osdmap_first_committed", get_first_committed());
5366 f
->dump_unsigned("osdmap_last_committed", get_last_committed());
5368 f
->open_object_section("crushmap");
5369 osdmap
.crush
->dump(f
);
5372 if (has_osdmap_manifest
) {
5373 f
->open_object_section("osdmap_manifest");
5374 osdmap_manifest
.dump(f
);
5380 enum osd_pool_get_choices
{
5382 PG_NUM
, PGP_NUM
, CRUSH_RULE
, HASHPSPOOL
, EC_OVERWRITES
,
5383 NODELETE
, NOPGCHANGE
, NOSIZECHANGE
,
5384 WRITE_FADVISE_DONTNEED
, NOSCRUB
, NODEEP_SCRUB
,
5385 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
5386 USE_GMT_HITSET
, TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
,
5387 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
5388 CACHE_TARGET_FULL_RATIO
,
5389 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
5390 ERASURE_CODE_PROFILE
, MIN_READ_RECENCY_FOR_PROMOTE
,
5391 MIN_WRITE_RECENCY_FOR_PROMOTE
, FAST_READ
,
5392 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
,
5393 SCRUB_MIN_INTERVAL
, SCRUB_MAX_INTERVAL
, DEEP_SCRUB_INTERVAL
,
5394 RECOVERY_PRIORITY
, RECOVERY_OP_PRIORITY
, SCRUB_PRIORITY
,
5395 COMPRESSION_MODE
, COMPRESSION_ALGORITHM
, COMPRESSION_REQUIRED_RATIO
,
5396 COMPRESSION_MAX_BLOB_SIZE
, COMPRESSION_MIN_BLOB_SIZE
,
5397 CSUM_TYPE
, CSUM_MAX_BLOCK
, CSUM_MIN_BLOCK
, FINGERPRINT_ALGORITHM
,
5398 PG_AUTOSCALE_MODE
, PG_NUM_MIN
, TARGET_SIZE_BYTES
, TARGET_SIZE_RATIO
,
5399 PG_AUTOSCALE_BIAS
, DEDUP_TIER
, DEDUP_CHUNK_ALGORITHM
,
5400 DEDUP_CDC_CHUNK_SIZE
};
5402 std::set
<osd_pool_get_choices
>
5403 subtract_second_from_first(const std::set
<osd_pool_get_choices
>& first
,
5404 const std::set
<osd_pool_get_choices
>& second
)
5406 std::set
<osd_pool_get_choices
> result
;
5407 std::set_difference(first
.begin(), first
.end(),
5408 second
.begin(), second
.end(),
5409 std::inserter(result
, result
.end()));
5415 bool OSDMonitor::preprocess_command(MonOpRequestRef op
)
5417 op
->mark_osdmon_event(__func__
);
5418 auto m
= op
->get_req
<MMonCommand
>();
5421 stringstream ss
, ds
;
5424 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
5425 string rs
= ss
.str();
5426 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
5430 MonSession
*session
= op
->get_session();
5432 derr
<< __func__
<< " no session" << dendl
;
5433 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
5438 cmd_getval(cmdmap
, "prefix", prefix
);
5441 cmd_getval(cmdmap
, "format", format
, string("plain"));
5442 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
5444 if (prefix
== "osd stat") {
5446 f
->open_object_section("osdmap");
5447 osdmap
.print_summary(f
.get(), ds
, "", true);
5451 osdmap
.print_summary(nullptr, ds
, "", true);
5455 else if (prefix
== "osd dump" ||
5456 prefix
== "osd tree" ||
5457 prefix
== "osd tree-from" ||
5458 prefix
== "osd ls" ||
5459 prefix
== "osd getmap" ||
5460 prefix
== "osd getcrushmap" ||
5461 prefix
== "osd ls-tree" ||
5462 prefix
== "osd info") {
5466 cmd_getval(cmdmap
, "epoch", epochnum
, (int64_t)osdmap
.get_epoch());
5469 bufferlist osdmap_bl
;
5470 int err
= get_version_full(epoch
, osdmap_bl
);
5471 if (err
== -ENOENT
) {
5473 ss
<< "there is no map for epoch " << epoch
;
5476 ceph_assert(err
== 0);
5477 ceph_assert(osdmap_bl
.length());
5480 if (epoch
== osdmap
.get_epoch()) {
5484 p
->decode(osdmap_bl
);
5487 auto sg
= make_scope_guard([&] {
5493 if (prefix
== "osd dump") {
5496 f
->open_object_section("osdmap");
5506 } else if (prefix
== "osd ls") {
5508 f
->open_array_section("osds");
5509 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5510 if (osdmap
.exists(i
)) {
5511 f
->dump_int("osd", i
);
5518 for (int i
= 0; i
< osdmap
.get_max_osd(); i
++) {
5519 if (osdmap
.exists(i
)) {
5528 } else if (prefix
== "osd info") {
5530 bool do_single_osd
= true;
5531 if (!cmd_getval(cmdmap
, "id", osd_id
)) {
5532 do_single_osd
= false;
5535 if (do_single_osd
&& !osdmap
.exists(osd_id
)) {
5536 ss
<< "osd." << osd_id
<< " does not exist";
5542 if (do_single_osd
) {
5543 osdmap
.dump_osd(osd_id
, f
.get());
5545 osdmap
.dump_osds(f
.get());
5549 if (do_single_osd
) {
5550 osdmap
.print_osd(osd_id
, ds
);
5552 osdmap
.print_osds(ds
);
5556 } else if (prefix
== "osd tree" || prefix
== "osd tree-from") {
5558 if (prefix
== "osd tree-from") {
5559 cmd_getval(cmdmap
, "bucket", bucket
);
5560 if (!osdmap
.crush
->name_exists(bucket
)) {
5561 ss
<< "bucket '" << bucket
<< "' does not exist";
5565 int id
= osdmap
.crush
->get_item_id(bucket
);
5567 ss
<< "\"" << bucket
<< "\" is not a bucket";
5573 vector
<string
> states
;
5574 cmd_getval(cmdmap
, "states", states
);
5575 unsigned filter
= 0;
5576 for (auto& s
: states
) {
5578 filter
|= OSDMap::DUMP_UP
;
5579 } else if (s
== "down") {
5580 filter
|= OSDMap::DUMP_DOWN
;
5581 } else if (s
== "in") {
5582 filter
|= OSDMap::DUMP_IN
;
5583 } else if (s
== "out") {
5584 filter
|= OSDMap::DUMP_OUT
;
5585 } else if (s
== "destroyed") {
5586 filter
|= OSDMap::DUMP_DESTROYED
;
5588 ss
<< "unrecognized state '" << s
<< "'";
5593 if ((filter
& (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) ==
5594 (OSDMap::DUMP_IN
|OSDMap::DUMP_OUT
)) {
5595 ss
<< "cannot specify both 'in' and 'out'";
5599 if (((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ==
5600 (OSDMap::DUMP_UP
|OSDMap::DUMP_DOWN
)) ||
5601 ((filter
& (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ==
5602 (OSDMap::DUMP_UP
|OSDMap::DUMP_DESTROYED
)) ||
5603 ((filter
& (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
)) ==
5604 (OSDMap::DUMP_DOWN
|OSDMap::DUMP_DESTROYED
))) {
5605 ss
<< "can specify only one of 'up', 'down' and 'destroyed'";
5610 f
->open_object_section("tree");
5611 p
->print_tree(f
.get(), NULL
, filter
, bucket
);
5615 p
->print_tree(NULL
, &ds
, filter
, bucket
);
5618 } else if (prefix
== "osd getmap") {
5619 rdata
.append(osdmap_bl
);
5620 ss
<< "got osdmap epoch " << p
->get_epoch();
5621 } else if (prefix
== "osd getcrushmap") {
5622 p
->crush
->encode(rdata
, mon
.get_quorum_con_features());
5623 ss
<< p
->get_crush_version();
5624 } else if (prefix
== "osd ls-tree") {
5626 cmd_getval(cmdmap
, "name", bucket_name
);
5628 r
= p
->get_osds_by_bucket_name(bucket_name
, &osds
);
5630 ss
<< "\"" << bucket_name
<< "\" does not exist";
5633 ss
<< "can not parse bucket name:\"" << bucket_name
<< "\"";
5638 f
->open_array_section("osds");
5639 for (auto &i
: osds
) {
5640 if (osdmap
.exists(i
)) {
5641 f
->dump_int("osd", i
);
5648 for (auto &i
: osds
) {
5649 if (osdmap
.exists(i
)) {
5660 } else if (prefix
== "osd getmaxosd") {
5662 f
->open_object_section("getmaxosd");
5663 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5664 f
->dump_int("max_osd", osdmap
.get_max_osd());
5668 ds
<< "max_osd = " << osdmap
.get_max_osd() << " in epoch " << osdmap
.get_epoch();
5671 } else if (prefix
== "osd utilization") {
5673 osdmap
.summarize_mapping_stats(NULL
, NULL
, &out
, f
.get());
5680 } else if (prefix
== "osd find") {
5682 if (!cmd_getval(cmdmap
, "id", osd
)) {
5683 ss
<< "unable to parse osd id value '"
5684 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5688 if (!osdmap
.exists(osd
)) {
5689 ss
<< "osd." << osd
<< " does not exist";
5694 cmd_getval(cmdmap
, "format", format
);
5695 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5696 f
->open_object_section("osd_location");
5697 f
->dump_int("osd", osd
);
5698 f
->dump_object("addrs", osdmap
.get_addrs(osd
));
5699 f
->dump_stream("osd_fsid") << osdmap
.get_uuid(osd
);
5701 // try to identify host, pod/container name, etc.
5702 map
<string
,string
> m
;
5703 load_metadata(osd
, m
, nullptr);
5704 if (auto p
= m
.find("hostname"); p
!= m
.end()) {
5705 f
->dump_string("host", p
->second
);
5708 "pod_name", "pod_namespace", // set by rook
5709 "container_name" // set by cephadm, ceph-ansible
5711 if (auto p
= m
.find(k
); p
!= m
.end()) {
5712 f
->dump_string(k
, p
->second
);
5716 // crush is helpful too
5717 f
->open_object_section("crush_location");
5718 map
<string
,string
> loc
= osdmap
.crush
->get_full_location(osd
);
5719 for (map
<string
,string
>::iterator p
= loc
.begin(); p
!= loc
.end(); ++p
)
5720 f
->dump_string(p
->first
.c_str(), p
->second
);
5724 } else if (prefix
== "osd metadata") {
5726 if (cmd_vartype_stringify(cmdmap
["id"]).size() &&
5727 !cmd_getval(cmdmap
, "id", osd
)) {
5728 ss
<< "unable to parse osd id value '"
5729 << cmd_vartype_stringify(cmdmap
["id"]) << "'";
5733 if (osd
>= 0 && !osdmap
.exists(osd
)) {
5734 ss
<< "osd." << osd
<< " does not exist";
5739 cmd_getval(cmdmap
, "format", format
);
5740 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
5742 f
->open_object_section("osd_metadata");
5743 f
->dump_unsigned("id", osd
);
5744 r
= dump_osd_metadata(osd
, f
.get(), &ss
);
5750 f
->open_array_section("osd_metadata");
5751 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5752 if (osdmap
.exists(i
)) {
5753 f
->open_object_section("osd");
5754 f
->dump_unsigned("id", i
);
5755 r
= dump_osd_metadata(i
, f
.get(), NULL
);
5756 if (r
== -EINVAL
|| r
== -ENOENT
) {
5757 // Drop error, continue to get other daemons' metadata
5758 dout(4) << "No metadata for osd." << i
<< dendl
;
5770 } else if (prefix
== "osd versions") {
5772 f
.reset(Formatter::create("json-pretty"));
5773 count_metadata("ceph_version", f
.get());
5776 } else if (prefix
== "osd count-metadata") {
5778 f
.reset(Formatter::create("json-pretty"));
5780 cmd_getval(cmdmap
, "property", field
);
5781 count_metadata(field
, f
.get());
5784 } else if (prefix
== "osd numa-status") {
5787 f
->open_array_section("osds");
5789 tbl
.define_column("OSD", TextTable::LEFT
, TextTable::RIGHT
);
5790 tbl
.define_column("HOST", TextTable::LEFT
, TextTable::LEFT
);
5791 tbl
.define_column("NETWORK", TextTable::RIGHT
, TextTable::RIGHT
);
5792 tbl
.define_column("STORAGE", TextTable::RIGHT
, TextTable::RIGHT
);
5793 tbl
.define_column("AFFINITY", TextTable::RIGHT
, TextTable::RIGHT
);
5794 tbl
.define_column("CPUS", TextTable::LEFT
, TextTable::LEFT
);
5796 for (int i
=0; i
<osdmap
.get_max_osd(); ++i
) {
5797 if (osdmap
.exists(i
)) {
5798 map
<string
,string
> m
;
5800 if (load_metadata(i
, m
, &err
) < 0) {
5804 auto p
= m
.find("hostname");
5809 f
->open_object_section("osd");
5810 f
->dump_int("osd", i
);
5811 f
->dump_string("host", host
);
5812 for (auto n
: { "network_numa_node", "objectstore_numa_node",
5816 f
->dump_int(n
, atoi(p
->second
.c_str()));
5819 for (auto n
: { "network_numa_nodes", "objectstore_numa_nodes" }) {
5822 list
<string
> ls
= get_str_list(p
->second
, ",");
5823 f
->open_array_section(n
);
5824 for (auto node
: ls
) {
5825 f
->dump_int("node", atoi(node
.c_str()));
5830 for (auto n
: { "numa_node_cpus" }) {
5833 dump_cpu_list(f
.get(), n
, p
->second
);
5840 p
= m
.find("network_numa_nodes");
5846 p
= m
.find("objectstore_numa_nodes");
5852 p
= m
.find("numa_node");
5853 auto q
= m
.find("numa_node_cpus");
5854 if (p
!= m
.end() && q
!= m
.end()) {
5861 tbl
<< TextTable::endrow
;
5869 rdata
.append(stringify(tbl
));
5871 } else if (prefix
== "osd map") {
5872 string poolstr
, objstr
, namespacestr
;
5873 cmd_getval(cmdmap
, "pool", poolstr
);
5874 cmd_getval(cmdmap
, "object", objstr
);
5875 cmd_getval(cmdmap
, "nspace", namespacestr
);
5877 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
5879 ss
<< "pool " << poolstr
<< " does not exist";
5883 object_locator_t
oloc(pool
, namespacestr
);
5884 object_t
oid(objstr
);
5885 pg_t pgid
= osdmap
.object_locator_to_pg(oid
, oloc
);
5886 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5887 vector
<int> up
, acting
;
5889 osdmap
.pg_to_up_acting_osds(mpgid
, &up
, &up_p
, &acting
, &acting_p
);
5892 if (!namespacestr
.empty())
5893 fullobjname
= namespacestr
+ string("/") + oid
.name
;
5895 fullobjname
= oid
.name
;
5897 f
->open_object_section("osd_map");
5898 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5899 f
->dump_string("pool", poolstr
);
5900 f
->dump_int("pool_id", pool
);
5901 f
->dump_stream("objname") << fullobjname
;
5902 f
->dump_stream("raw_pgid") << pgid
;
5903 f
->dump_stream("pgid") << mpgid
;
5904 f
->open_array_section("up");
5905 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
5906 f
->dump_int("osd", *p
);
5908 f
->dump_int("up_primary", up_p
);
5909 f
->open_array_section("acting");
5910 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
5911 f
->dump_int("osd", *p
);
5913 f
->dump_int("acting_primary", acting_p
);
5914 f
->close_section(); // osd_map
5917 ds
<< "osdmap e" << osdmap
.get_epoch()
5918 << " pool '" << poolstr
<< "' (" << pool
<< ")"
5919 << " object '" << fullobjname
<< "' ->"
5920 << " pg " << pgid
<< " (" << mpgid
<< ")"
5921 << " -> up (" << pg_vector_string(up
) << ", p" << up_p
<< ") acting ("
5922 << pg_vector_string(acting
) << ", p" << acting_p
<< ")";
5926 } else if (prefix
== "pg map") {
5929 cmd_getval(cmdmap
, "pgid", pgidstr
);
5930 if (!pgid
.parse(pgidstr
.c_str())) {
5931 ss
<< "invalid pgid '" << pgidstr
<< "'";
5935 vector
<int> up
, acting
;
5936 if (!osdmap
.have_pg_pool(pgid
.pool())) {
5937 ss
<< "pg '" << pgidstr
<< "' does not exist";
5941 pg_t mpgid
= osdmap
.raw_pg_to_pg(pgid
);
5942 osdmap
.pg_to_up_acting_osds(pgid
, up
, acting
);
5944 f
->open_object_section("pg_map");
5945 f
->dump_unsigned("epoch", osdmap
.get_epoch());
5946 f
->dump_stream("raw_pgid") << pgid
;
5947 f
->dump_stream("pgid") << mpgid
;
5948 f
->open_array_section("up");
5949 for (auto osd
: up
) {
5950 f
->dump_int("up_osd", osd
);
5953 f
->open_array_section("acting");
5954 for (auto osd
: acting
) {
5955 f
->dump_int("acting_osd", osd
);
5961 ds
<< "osdmap e" << osdmap
.get_epoch()
5962 << " pg " << pgid
<< " (" << mpgid
<< ")"
5963 << " -> up " << up
<< " acting " << acting
;
5968 } else if (prefix
== "osd lspools") {
5970 f
->open_array_section("pools");
5971 for (map
<int64_t, pg_pool_t
>::iterator p
= osdmap
.pools
.begin();
5972 p
!= osdmap
.pools
.end();
5975 f
->open_object_section("pool");
5976 f
->dump_int("poolnum", p
->first
);
5977 f
->dump_string("poolname", osdmap
.pool_name
[p
->first
]);
5980 ds
<< p
->first
<< ' ' << osdmap
.pool_name
[p
->first
];
5981 if (next(p
) != osdmap
.pools
.end()) {
5991 } else if (prefix
== "osd blocklist ls" ||
5992 prefix
== "osd blacklist ls") {
5994 f
->open_array_section("blocklist");
5996 for (ceph::unordered_map
<entity_addr_t
,utime_t
>::iterator p
= osdmap
.blocklist
.begin();
5997 p
!= osdmap
.blocklist
.end();
6000 f
->open_object_section("entry");
6001 f
->dump_string("addr", p
->first
.get_legacy_str());
6002 f
->dump_stream("until") << p
->second
;
6007 ss
<< p
->first
<< " " << p
->second
;
6017 ss
<< "listed " << osdmap
.blocklist
.size() << " entries";
6019 } else if (prefix
== "osd pool ls") {
6021 cmd_getval(cmdmap
, "detail", detail
);
6022 if (!f
&& detail
== "detail") {
6024 osdmap
.print_pools(ss
);
6025 rdata
.append(ss
.str());
6028 f
->open_array_section("pools");
6029 for (map
<int64_t,pg_pool_t
>::const_iterator it
= osdmap
.get_pools().begin();
6030 it
!= osdmap
.get_pools().end();
6033 if (detail
== "detail") {
6034 f
->open_object_section("pool");
6035 f
->dump_int("pool_id", it
->first
);
6036 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6037 it
->second
.dump(f
.get());
6040 f
->dump_string("pool_name", osdmap
.get_pool_name(it
->first
));
6043 rdata
.append(osdmap
.get_pool_name(it
->first
) + "\n");
6052 } else if (prefix
== "osd crush get-tunable") {
6054 cmd_getval(cmdmap
, "tunable", tunable
);
6057 f
->open_object_section("tunable");
6058 if (tunable
== "straw_calc_version") {
6060 f
->dump_int(tunable
.c_str(), osdmap
.crush
->get_straw_calc_version());
6062 rss
<< osdmap
.crush
->get_straw_calc_version() << "\n";
6071 rdata
.append(rss
.str());
6075 } else if (prefix
== "osd pool get") {
6077 cmd_getval(cmdmap
, "pool", poolstr
);
6078 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
6080 ss
<< "unrecognized pool '" << poolstr
<< "'";
6085 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
6087 cmd_getval(cmdmap
, "var", var
);
6089 typedef std::map
<std::string
, osd_pool_get_choices
> choices_map_t
;
6090 const choices_map_t ALL_CHOICES
= {
6092 {"min_size", MIN_SIZE
},
6093 {"pg_num", PG_NUM
}, {"pgp_num", PGP_NUM
},
6094 {"crush_rule", CRUSH_RULE
}, {"hashpspool", HASHPSPOOL
},
6095 {"allow_ec_overwrites", EC_OVERWRITES
}, {"nodelete", NODELETE
},
6096 {"nopgchange", NOPGCHANGE
}, {"nosizechange", NOSIZECHANGE
},
6097 {"noscrub", NOSCRUB
}, {"nodeep-scrub", NODEEP_SCRUB
},
6098 {"write_fadvise_dontneed", WRITE_FADVISE_DONTNEED
},
6099 {"hit_set_type", HIT_SET_TYPE
}, {"hit_set_period", HIT_SET_PERIOD
},
6100 {"hit_set_count", HIT_SET_COUNT
}, {"hit_set_fpp", HIT_SET_FPP
},
6101 {"use_gmt_hitset", USE_GMT_HITSET
},
6102 {"target_max_objects", TARGET_MAX_OBJECTS
},
6103 {"target_max_bytes", TARGET_MAX_BYTES
},
6104 {"cache_target_dirty_ratio", CACHE_TARGET_DIRTY_RATIO
},
6105 {"cache_target_dirty_high_ratio", CACHE_TARGET_DIRTY_HIGH_RATIO
},
6106 {"cache_target_full_ratio", CACHE_TARGET_FULL_RATIO
},
6107 {"cache_min_flush_age", CACHE_MIN_FLUSH_AGE
},
6108 {"cache_min_evict_age", CACHE_MIN_EVICT_AGE
},
6109 {"erasure_code_profile", ERASURE_CODE_PROFILE
},
6110 {"min_read_recency_for_promote", MIN_READ_RECENCY_FOR_PROMOTE
},
6111 {"min_write_recency_for_promote", MIN_WRITE_RECENCY_FOR_PROMOTE
},
6112 {"fast_read", FAST_READ
},
6113 {"hit_set_grade_decay_rate", HIT_SET_GRADE_DECAY_RATE
},
6114 {"hit_set_search_last_n", HIT_SET_SEARCH_LAST_N
},
6115 {"scrub_min_interval", SCRUB_MIN_INTERVAL
},
6116 {"scrub_max_interval", SCRUB_MAX_INTERVAL
},
6117 {"deep_scrub_interval", DEEP_SCRUB_INTERVAL
},
6118 {"recovery_priority", RECOVERY_PRIORITY
},
6119 {"recovery_op_priority", RECOVERY_OP_PRIORITY
},
6120 {"scrub_priority", SCRUB_PRIORITY
},
6121 {"compression_mode", COMPRESSION_MODE
},
6122 {"compression_algorithm", COMPRESSION_ALGORITHM
},
6123 {"compression_required_ratio", COMPRESSION_REQUIRED_RATIO
},
6124 {"compression_max_blob_size", COMPRESSION_MAX_BLOB_SIZE
},
6125 {"compression_min_blob_size", COMPRESSION_MIN_BLOB_SIZE
},
6126 {"csum_type", CSUM_TYPE
},
6127 {"csum_max_block", CSUM_MAX_BLOCK
},
6128 {"csum_min_block", CSUM_MIN_BLOCK
},
6129 {"fingerprint_algorithm", FINGERPRINT_ALGORITHM
},
6130 {"pg_autoscale_mode", PG_AUTOSCALE_MODE
},
6131 {"pg_num_min", PG_NUM_MIN
},
6132 {"target_size_bytes", TARGET_SIZE_BYTES
},
6133 {"target_size_ratio", TARGET_SIZE_RATIO
},
6134 {"pg_autoscale_bias", PG_AUTOSCALE_BIAS
},
6135 {"dedup_tier", DEDUP_TIER
},
6136 {"dedup_chunk_algorithm", DEDUP_CHUNK_ALGORITHM
},
6137 {"dedup_cdc_chunk_size", DEDUP_CDC_CHUNK_SIZE
},
6140 typedef std::set
<osd_pool_get_choices
> choices_set_t
;
6142 const choices_set_t ONLY_TIER_CHOICES
= {
6143 HIT_SET_TYPE
, HIT_SET_PERIOD
, HIT_SET_COUNT
, HIT_SET_FPP
,
6144 TARGET_MAX_OBJECTS
, TARGET_MAX_BYTES
, CACHE_TARGET_FULL_RATIO
,
6145 CACHE_TARGET_DIRTY_RATIO
, CACHE_TARGET_DIRTY_HIGH_RATIO
,
6146 CACHE_MIN_FLUSH_AGE
, CACHE_MIN_EVICT_AGE
,
6147 MIN_READ_RECENCY_FOR_PROMOTE
,
6148 MIN_WRITE_RECENCY_FOR_PROMOTE
,
6149 HIT_SET_GRADE_DECAY_RATE
, HIT_SET_SEARCH_LAST_N
6151 const choices_set_t ONLY_ERASURE_CHOICES
= {
6152 EC_OVERWRITES
, ERASURE_CODE_PROFILE
6155 choices_set_t selected_choices
;
6157 for(choices_map_t::const_iterator it
= ALL_CHOICES
.begin();
6158 it
!= ALL_CHOICES
.end(); ++it
) {
6159 selected_choices
.insert(it
->second
);
6163 selected_choices
= subtract_second_from_first(selected_choices
,
6167 if(!p
->is_erasure()) {
6168 selected_choices
= subtract_second_from_first(selected_choices
,
6169 ONLY_ERASURE_CHOICES
);
6171 } else /* var != "all" */ {
6172 choices_map_t::const_iterator found
= ALL_CHOICES
.find(var
);
6173 osd_pool_get_choices selected
= found
->second
;
6175 if (!p
->is_tier() &&
6176 ONLY_TIER_CHOICES
.find(selected
) != ONLY_TIER_CHOICES
.end()) {
6177 ss
<< "pool '" << poolstr
6178 << "' is not a tier pool: variable not applicable";
6183 if (!p
->is_erasure() &&
6184 ONLY_ERASURE_CHOICES
.find(selected
)
6185 != ONLY_ERASURE_CHOICES
.end()) {
6186 ss
<< "pool '" << poolstr
6187 << "' is not a erasure pool: variable not applicable";
6192 if (pool_opts_t::is_opt_name(var
) &&
6193 !p
->opts
.is_set(pool_opts_t::get_opt_desc(var
).key
)) {
6194 ss
<< "option '" << var
<< "' is not set on pool '" << poolstr
<< "'";
6199 selected_choices
.insert(selected
);
6203 f
->open_object_section("pool");
6204 f
->dump_string("pool", poolstr
);
6205 f
->dump_int("pool_id", pool
);
6206 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6207 it
!= selected_choices
.end(); ++it
) {
6208 choices_map_t::const_iterator i
;
6209 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6210 if (i
->second
== *it
) {
6214 ceph_assert(i
!= ALL_CHOICES
.end());
6217 f
->dump_int("pg_num", p
->get_pg_num());
6220 f
->dump_int("pgp_num", p
->get_pgp_num());
6223 f
->dump_int("size", p
->get_size());
6226 f
->dump_int("min_size", p
->get_min_size());
6229 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6230 f
->dump_string("crush_rule", osdmap
.crush
->get_rule_name(
6231 p
->get_crush_rule()));
6233 f
->dump_string("crush_rule", stringify(p
->get_crush_rule()));
6237 f
->dump_bool("allow_ec_overwrites",
6238 p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
));
6240 case PG_AUTOSCALE_MODE
:
6241 f
->dump_string("pg_autoscale_mode",
6242 pg_pool_t::get_pg_autoscale_mode_name(
6243 p
->pg_autoscale_mode
));
6249 case WRITE_FADVISE_DONTNEED
:
6252 f
->dump_bool(i
->first
.c_str(),
6253 p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)));
6255 case HIT_SET_PERIOD
:
6256 f
->dump_int("hit_set_period", p
->hit_set_period
);
6259 f
->dump_int("hit_set_count", p
->hit_set_count
);
6262 f
->dump_string("hit_set_type",
6263 HitSet::get_type_name(p
->hit_set_params
.get_type()));
6267 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6268 BloomHitSet::Params
*bloomp
=
6269 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6270 f
->dump_float("hit_set_fpp", bloomp
->get_fpp());
6271 } else if(var
!= "all") {
6273 ss
<< "hit set is not of type Bloom; " <<
6274 "invalid to get a false positive rate!";
6280 case USE_GMT_HITSET
:
6281 f
->dump_bool("use_gmt_hitset", p
->use_gmt_hitset
);
6283 case TARGET_MAX_OBJECTS
:
6284 f
->dump_unsigned("target_max_objects", p
->target_max_objects
);
6286 case TARGET_MAX_BYTES
:
6287 f
->dump_unsigned("target_max_bytes", p
->target_max_bytes
);
6289 case CACHE_TARGET_DIRTY_RATIO
:
6290 f
->dump_unsigned("cache_target_dirty_ratio_micro",
6291 p
->cache_target_dirty_ratio_micro
);
6292 f
->dump_float("cache_target_dirty_ratio",
6293 ((float)p
->cache_target_dirty_ratio_micro
/1000000));
6295 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6296 f
->dump_unsigned("cache_target_dirty_high_ratio_micro",
6297 p
->cache_target_dirty_high_ratio_micro
);
6298 f
->dump_float("cache_target_dirty_high_ratio",
6299 ((float)p
->cache_target_dirty_high_ratio_micro
/1000000));
6301 case CACHE_TARGET_FULL_RATIO
:
6302 f
->dump_unsigned("cache_target_full_ratio_micro",
6303 p
->cache_target_full_ratio_micro
);
6304 f
->dump_float("cache_target_full_ratio",
6305 ((float)p
->cache_target_full_ratio_micro
/1000000));
6307 case CACHE_MIN_FLUSH_AGE
:
6308 f
->dump_unsigned("cache_min_flush_age", p
->cache_min_flush_age
);
6310 case CACHE_MIN_EVICT_AGE
:
6311 f
->dump_unsigned("cache_min_evict_age", p
->cache_min_evict_age
);
6313 case ERASURE_CODE_PROFILE
:
6314 f
->dump_string("erasure_code_profile", p
->erasure_code_profile
);
6316 case MIN_READ_RECENCY_FOR_PROMOTE
:
6317 f
->dump_int("min_read_recency_for_promote",
6318 p
->min_read_recency_for_promote
);
6320 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6321 f
->dump_int("min_write_recency_for_promote",
6322 p
->min_write_recency_for_promote
);
6325 f
->dump_int("fast_read", p
->fast_read
);
6327 case HIT_SET_GRADE_DECAY_RATE
:
6328 f
->dump_int("hit_set_grade_decay_rate",
6329 p
->hit_set_grade_decay_rate
);
6331 case HIT_SET_SEARCH_LAST_N
:
6332 f
->dump_int("hit_set_search_last_n",
6333 p
->hit_set_search_last_n
);
6335 case SCRUB_MIN_INTERVAL
:
6336 case SCRUB_MAX_INTERVAL
:
6337 case DEEP_SCRUB_INTERVAL
:
6338 case RECOVERY_PRIORITY
:
6339 case RECOVERY_OP_PRIORITY
:
6340 case SCRUB_PRIORITY
:
6341 case COMPRESSION_MODE
:
6342 case COMPRESSION_ALGORITHM
:
6343 case COMPRESSION_REQUIRED_RATIO
:
6344 case COMPRESSION_MAX_BLOB_SIZE
:
6345 case COMPRESSION_MIN_BLOB_SIZE
:
6347 case CSUM_MAX_BLOCK
:
6348 case CSUM_MIN_BLOCK
:
6349 case FINGERPRINT_ALGORITHM
:
6351 case TARGET_SIZE_BYTES
:
6352 case TARGET_SIZE_RATIO
:
6353 case PG_AUTOSCALE_BIAS
:
6355 case DEDUP_CHUNK_ALGORITHM
:
6356 case DEDUP_CDC_CHUNK_SIZE
:
6357 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6358 if (p
->opts
.is_set(key
)) {
6359 if(*it
== CSUM_TYPE
) {
6361 p
->opts
.get(pool_opts_t::CSUM_TYPE
, &val
);
6362 f
->dump_string(i
->first
.c_str(), Checksummer::get_csum_type_string(val
));
6364 p
->opts
.dump(i
->first
, f
.get());
6373 for(choices_set_t::const_iterator it
= selected_choices
.begin();
6374 it
!= selected_choices
.end(); ++it
) {
6375 choices_map_t::const_iterator i
;
6378 ss
<< "pg_num: " << p
->get_pg_num() << "\n";
6381 ss
<< "pgp_num: " << p
->get_pgp_num() << "\n";
6384 ss
<< "size: " << p
->get_size() << "\n";
6387 ss
<< "min_size: " << p
->get_min_size() << "\n";
6390 if (osdmap
.crush
->rule_exists(p
->get_crush_rule())) {
6391 ss
<< "crush_rule: " << osdmap
.crush
->get_rule_name(
6392 p
->get_crush_rule()) << "\n";
6394 ss
<< "crush_rule: " << p
->get_crush_rule() << "\n";
6397 case PG_AUTOSCALE_MODE
:
6398 ss
<< "pg_autoscale_mode: " << pg_pool_t::get_pg_autoscale_mode_name(
6399 p
->pg_autoscale_mode
) <<"\n";
6401 case HIT_SET_PERIOD
:
6402 ss
<< "hit_set_period: " << p
->hit_set_period
<< "\n";
6405 ss
<< "hit_set_count: " << p
->hit_set_count
<< "\n";
6408 ss
<< "hit_set_type: " <<
6409 HitSet::get_type_name(p
->hit_set_params
.get_type()) << "\n";
6413 if (p
->hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
6414 BloomHitSet::Params
*bloomp
=
6415 static_cast<BloomHitSet::Params
*>(p
->hit_set_params
.impl
.get());
6416 ss
<< "hit_set_fpp: " << bloomp
->get_fpp() << "\n";
6417 } else if(var
!= "all") {
6418 ss
<< "hit set is not of type Bloom; " <<
6419 "invalid to get a false positive rate!";
6425 case USE_GMT_HITSET
:
6426 ss
<< "use_gmt_hitset: " << p
->use_gmt_hitset
<< "\n";
6428 case TARGET_MAX_OBJECTS
:
6429 ss
<< "target_max_objects: " << p
->target_max_objects
<< "\n";
6431 case TARGET_MAX_BYTES
:
6432 ss
<< "target_max_bytes: " << p
->target_max_bytes
<< "\n";
6434 case CACHE_TARGET_DIRTY_RATIO
:
6435 ss
<< "cache_target_dirty_ratio: "
6436 << ((float)p
->cache_target_dirty_ratio_micro
/1000000) << "\n";
6438 case CACHE_TARGET_DIRTY_HIGH_RATIO
:
6439 ss
<< "cache_target_dirty_high_ratio: "
6440 << ((float)p
->cache_target_dirty_high_ratio_micro
/1000000) << "\n";
6442 case CACHE_TARGET_FULL_RATIO
:
6443 ss
<< "cache_target_full_ratio: "
6444 << ((float)p
->cache_target_full_ratio_micro
/1000000) << "\n";
6446 case CACHE_MIN_FLUSH_AGE
:
6447 ss
<< "cache_min_flush_age: " << p
->cache_min_flush_age
<< "\n";
6449 case CACHE_MIN_EVICT_AGE
:
6450 ss
<< "cache_min_evict_age: " << p
->cache_min_evict_age
<< "\n";
6452 case ERASURE_CODE_PROFILE
:
6453 ss
<< "erasure_code_profile: " << p
->erasure_code_profile
<< "\n";
6455 case MIN_READ_RECENCY_FOR_PROMOTE
:
6456 ss
<< "min_read_recency_for_promote: " <<
6457 p
->min_read_recency_for_promote
<< "\n";
6459 case HIT_SET_GRADE_DECAY_RATE
:
6460 ss
<< "hit_set_grade_decay_rate: " <<
6461 p
->hit_set_grade_decay_rate
<< "\n";
6463 case HIT_SET_SEARCH_LAST_N
:
6464 ss
<< "hit_set_search_last_n: " <<
6465 p
->hit_set_search_last_n
<< "\n";
6468 ss
<< "allow_ec_overwrites: " <<
6469 (p
->has_flag(pg_pool_t::FLAG_EC_OVERWRITES
) ? "true" : "false") <<
6476 case WRITE_FADVISE_DONTNEED
:
6479 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6480 if (i
->second
== *it
)
6483 ceph_assert(i
!= ALL_CHOICES
.end());
6484 ss
<< i
->first
<< ": " <<
6485 (p
->has_flag(pg_pool_t::get_flag_by_name(i
->first
)) ?
6486 "true" : "false") << "\n";
6488 case MIN_WRITE_RECENCY_FOR_PROMOTE
:
6489 ss
<< "min_write_recency_for_promote: " <<
6490 p
->min_write_recency_for_promote
<< "\n";
6493 ss
<< "fast_read: " << p
->fast_read
<< "\n";
6495 case SCRUB_MIN_INTERVAL
:
6496 case SCRUB_MAX_INTERVAL
:
6497 case DEEP_SCRUB_INTERVAL
:
6498 case RECOVERY_PRIORITY
:
6499 case RECOVERY_OP_PRIORITY
:
6500 case SCRUB_PRIORITY
:
6501 case COMPRESSION_MODE
:
6502 case COMPRESSION_ALGORITHM
:
6503 case COMPRESSION_REQUIRED_RATIO
:
6504 case COMPRESSION_MAX_BLOB_SIZE
:
6505 case COMPRESSION_MIN_BLOB_SIZE
:
6507 case CSUM_MAX_BLOCK
:
6508 case CSUM_MIN_BLOCK
:
6509 case FINGERPRINT_ALGORITHM
:
6511 case TARGET_SIZE_BYTES
:
6512 case TARGET_SIZE_RATIO
:
6513 case PG_AUTOSCALE_BIAS
:
6515 case DEDUP_CHUNK_ALGORITHM
:
6516 case DEDUP_CDC_CHUNK_SIZE
:
6517 for (i
= ALL_CHOICES
.begin(); i
!= ALL_CHOICES
.end(); ++i
) {
6518 if (i
->second
== *it
)
6521 ceph_assert(i
!= ALL_CHOICES
.end());
6523 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc(i
->first
).key
;
6524 if (p
->opts
.is_set(key
)) {
6525 if(key
== pool_opts_t::CSUM_TYPE
) {
6527 p
->opts
.get(key
, &val
);
6528 ss
<< i
->first
<< ": " << Checksummer::get_csum_type_string(val
) << "\n";
6530 ss
<< i
->first
<< ": " << p
->opts
.get(key
) << "\n";
6536 rdata
.append(ss
.str());
6541 } else if (prefix
== "osd pool get-quota") {
6543 cmd_getval(cmdmap
, "pool", pool_name
);
6545 int64_t poolid
= osdmap
.lookup_pg_pool_name(pool_name
);
6547 ceph_assert(poolid
== -ENOENT
);
6548 ss
<< "unrecognized pool '" << pool_name
<< "'";
6552 const pg_pool_t
*p
= osdmap
.get_pg_pool(poolid
);
6553 const pool_stat_t
* pstat
= mon
.mgrstatmon()->get_pool_stat(poolid
);
6554 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
6556 f
->open_object_section("pool_quotas");
6557 f
->dump_string("pool_name", pool_name
);
6558 f
->dump_unsigned("pool_id", poolid
);
6559 f
->dump_unsigned("quota_max_objects", p
->quota_max_objects
);
6560 f
->dump_int("current_num_objects", sum
.num_objects
);
6561 f
->dump_unsigned("quota_max_bytes", p
->quota_max_bytes
);
6562 f
->dump_int("current_num_bytes", sum
.num_bytes
);
6567 rs
<< "quotas for pool '" << pool_name
<< "':\n"
6568 << " max objects: ";
6569 if (p
->quota_max_objects
== 0)
6572 rs
<< si_u_t(p
->quota_max_objects
) << " objects";
6573 rs
<< " (current num objects: " << sum
.num_objects
<< " objects)";
6577 if (p
->quota_max_bytes
== 0)
6580 rs
<< byte_u_t(p
->quota_max_bytes
);
6581 rs
<< " (current num bytes: " << sum
.num_bytes
<< " bytes)";
6583 rdata
.append(rs
.str());
6587 } else if (prefix
== "osd crush rule list" ||
6588 prefix
== "osd crush rule ls") {
6590 f
->open_array_section("rules");
6591 osdmap
.crush
->list_rules(f
.get());
6596 osdmap
.crush
->list_rules(&ss
);
6597 rdata
.append(ss
.str());
6599 } else if (prefix
== "osd crush rule ls-by-class") {
6601 cmd_getval(cmdmap
, "class", class_name
);
6602 if (class_name
.empty()) {
6603 ss
<< "no class specified";
6608 r
= osdmap
.crush
->get_rules_by_class(class_name
, &rules
);
6610 ss
<< "failed to get rules by class '" << class_name
<< "'";
6614 f
->open_array_section("rules");
6615 for (auto &rule
: rules
) {
6616 f
->dump_string("name", osdmap
.crush
->get_rule_name(rule
));
6622 for (auto &rule
: rules
) {
6623 rs
<< osdmap
.crush
->get_rule_name(rule
) << "\n";
6625 rdata
.append(rs
.str());
6627 } else if (prefix
== "osd crush rule dump") {
6629 cmd_getval(cmdmap
, "name", name
);
6631 cmd_getval(cmdmap
, "format", format
);
6632 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6634 f
->open_array_section("rules");
6635 osdmap
.crush
->dump_rules(f
.get());
6638 int ruleno
= osdmap
.crush
->get_rule_id(name
);
6640 ss
<< "unknown crush rule '" << name
<< "'";
6644 osdmap
.crush
->dump_rule(ruleno
, f
.get());
6649 rdata
.append(rs
.str());
6650 } else if (prefix
== "osd crush dump") {
6652 cmd_getval(cmdmap
, "format", format
);
6653 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6654 f
->open_object_section("crush_map");
6655 osdmap
.crush
->dump(f
.get());
6660 rdata
.append(rs
.str());
6661 } else if (prefix
== "osd crush show-tunables") {
6663 cmd_getval(cmdmap
, "format", format
);
6664 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6665 f
->open_object_section("crush_map_tunables");
6666 osdmap
.crush
->dump_tunables(f
.get());
6671 rdata
.append(rs
.str());
6672 } else if (prefix
== "osd crush tree") {
6674 cmd_getval(cmdmap
, "shadow", shadow
);
6675 bool show_shadow
= shadow
== "--show-shadow";
6676 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6678 f
->open_object_section("crush_tree");
6679 osdmap
.crush
->dump_tree(nullptr,
6681 osdmap
.get_pool_names(),
6687 osdmap
.crush
->dump_tree(&ss
,
6689 osdmap
.get_pool_names(),
6691 rdata
.append(ss
.str());
6693 } else if (prefix
== "osd crush ls") {
6695 if (!cmd_getval(cmdmap
, "node", name
)) {
6696 ss
<< "no node specified";
6700 if (!osdmap
.crush
->name_exists(name
)) {
6701 ss
<< "node '" << name
<< "' does not exist";
6705 int id
= osdmap
.crush
->get_item_id(name
);
6708 result
.push_back(id
);
6710 int num
= osdmap
.crush
->get_bucket_size(id
);
6711 for (int i
= 0; i
< num
; ++i
) {
6712 result
.push_back(osdmap
.crush
->get_bucket_item(id
, i
));
6716 f
->open_array_section("items");
6717 for (auto i
: result
) {
6718 f
->dump_string("item", osdmap
.crush
->get_item_name(i
));
6724 for (auto i
: result
) {
6725 ss
<< osdmap
.crush
->get_item_name(i
) << "\n";
6727 rdata
.append(ss
.str());
6730 } else if (prefix
== "osd crush class ls") {
6731 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json-pretty"));
6732 f
->open_array_section("crush_classes");
6733 for (auto i
: osdmap
.crush
->class_name
)
6734 f
->dump_string("class", i
.second
);
6737 } else if (prefix
== "osd crush class ls-osd") {
6739 cmd_getval(cmdmap
, "class", name
);
6741 osdmap
.crush
->get_devices_by_class(name
, &osds
);
6743 f
->open_array_section("osds");
6744 for (auto &osd
: osds
)
6745 f
->dump_int("osd", osd
);
6750 for (auto &osd
: osds
) {
6758 } else if (prefix
== "osd crush get-device-class") {
6759 vector
<string
> idvec
;
6760 cmd_getval(cmdmap
, "ids", idvec
);
6761 map
<int, string
> class_by_osd
;
6762 for (auto& id
: idvec
) {
6764 long osd
= parse_osd_id(id
.c_str(), &ts
);
6766 ss
<< "unable to parse osd id:'" << id
<< "'";
6770 auto device_class
= osdmap
.crush
->get_item_class(osd
);
6772 class_by_osd
[osd
] = device_class
;
6774 class_by_osd
[osd
] = ""; // no class
6777 f
->open_array_section("osd_device_classes");
6778 for (auto& i
: class_by_osd
) {
6779 f
->open_object_section("osd_device_class");
6780 f
->dump_int("osd", i
.first
);
6781 f
->dump_string("device_class", i
.second
);
6787 if (class_by_osd
.size() == 1) {
6788 // for single input, make a clean output
6789 ds
<< class_by_osd
.begin()->second
;
6791 // note that we do not group osds by class here
6792 for (auto it
= class_by_osd
.begin();
6793 it
!= class_by_osd
.end();
6795 ds
<< "osd." << it
->first
<< ' ' << it
->second
;
6796 if (next(it
) != class_by_osd
.end())
6802 } else if (prefix
== "osd erasure-code-profile ls") {
6803 const auto &profiles
= osdmap
.get_erasure_code_profiles();
6805 f
->open_array_section("erasure-code-profiles");
6806 for (auto i
= profiles
.begin(); i
!= profiles
.end(); ++i
) {
6808 f
->dump_string("profile", i
->first
.c_str());
6810 rdata
.append(i
->first
+ "\n");
6817 rdata
.append(rs
.str());
6819 } else if (prefix
== "osd crush weight-set ls") {
6820 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
6822 f
->open_array_section("weight_sets");
6823 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6824 f
->dump_string("pool", "(compat)");
6826 for (auto& i
: osdmap
.crush
->choose_args
) {
6828 f
->dump_string("pool", osdmap
.get_pool_name(i
.first
));
6835 if (osdmap
.crush
->have_choose_args(CrushWrapper::DEFAULT_CHOOSE_ARGS
)) {
6838 for (auto& i
: osdmap
.crush
->choose_args
) {
6840 rs
<< osdmap
.get_pool_name(i
.first
) << "\n";
6843 rdata
.append(rs
.str());
6845 } else if (prefix
== "osd crush weight-set dump") {
6846 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6848 osdmap
.crush
->dump_choose_args(f
.get());
6850 } else if (prefix
== "osd erasure-code-profile get") {
6852 cmd_getval(cmdmap
, "name", name
);
6853 if (!osdmap
.has_erasure_code_profile(name
)) {
6854 ss
<< "unknown erasure code profile '" << name
<< "'";
6858 const map
<string
,string
> &profile
= osdmap
.get_erasure_code_profile(name
);
6860 f
->open_object_section("profile");
6861 for (map
<string
,string
>::const_iterator i
= profile
.begin();
6865 f
->dump_string(i
->first
.c_str(), i
->second
.c_str());
6867 rdata
.append(i
->first
+ "=" + i
->second
+ "\n");
6874 rdata
.append(rs
.str());
6876 } else if (prefix
== "osd pool application get") {
6877 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty",
6880 cmd_getval(cmdmap
, "pool", pool_name
);
6882 cmd_getval(cmdmap
, "app", app
);
6884 cmd_getval(cmdmap
, "key", key
);
6886 if (pool_name
.empty()) {
6888 f
->open_object_section("pools");
6889 for (const auto &pool
: osdmap
.pools
) {
6890 std::string
name("<unknown>");
6891 const auto &pni
= osdmap
.pool_name
.find(pool
.first
);
6892 if (pni
!= osdmap
.pool_name
.end())
6894 f
->open_object_section(name
.c_str());
6895 for (auto &app_pair
: pool
.second
.application_metadata
) {
6896 f
->open_object_section(app_pair
.first
.c_str());
6897 for (auto &kv_pair
: app_pair
.second
) {
6898 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6902 f
->close_section(); // name
6904 f
->close_section(); // pools
6907 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
6909 ss
<< "unrecognized pool '" << pool_name
<< "'";
6913 auto p
= osdmap
.get_pg_pool(pool
);
6916 f
->open_object_section(pool_name
.c_str());
6917 for (auto &app_pair
: p
->application_metadata
) {
6918 f
->open_object_section(app_pair
.first
.c_str());
6919 for (auto &kv_pair
: app_pair
.second
) {
6920 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6922 f
->close_section(); // application
6924 f
->close_section(); // pool_name
6929 auto app_it
= p
->application_metadata
.find(app
);
6930 if (app_it
== p
->application_metadata
.end()) {
6931 ss
<< "pool '" << pool_name
<< "' has no application '" << app
<< "'";
6935 // filter by pool + app
6937 f
->open_object_section(app_it
->first
.c_str());
6938 for (auto &kv_pair
: app_it
->second
) {
6939 f
->dump_string(kv_pair
.first
.c_str(), kv_pair
.second
);
6941 f
->close_section(); // application
6945 // filter by pool + app + key
6946 auto key_it
= app_it
->second
.find(key
);
6947 if (key_it
== app_it
->second
.end()) {
6948 ss
<< "application '" << app
<< "' on pool '" << pool_name
6949 << "' does not have key '" << key
<< "'";
6953 ss
<< key_it
->second
<< "\n";
6954 rdata
.append(ss
.str());
6957 } else if (prefix
== "osd get-require-min-compat-client") {
6958 ss
<< osdmap
.require_min_compat_client
<< std::endl
;
6959 rdata
.append(ss
.str());
6962 } else if (prefix
== "osd pool application enable" ||
6963 prefix
== "osd pool application disable" ||
6964 prefix
== "osd pool application set" ||
6965 prefix
== "osd pool application rm") {
6966 bool changed
= false;
6967 r
= preprocess_command_pool_application(prefix
, cmdmap
, ss
, &changed
);
6971 } else if (changed
) {
6972 // Valid mutation, proceed to prepare phase
6975 // Idempotent case, reply
6979 // try prepare update
6986 mon
.reply_command(op
, r
, rs
, rdata
, get_last_committed());
6990 void OSDMonitor::set_pool_flags(int64_t pool_id
, uint64_t flags
)
6992 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
6993 osdmap
.get_pg_pool(pool_id
));
6995 pool
->set_flag(flags
);
6998 void OSDMonitor::clear_pool_flags(int64_t pool_id
, uint64_t flags
)
7000 pg_pool_t
*pool
= pending_inc
.get_new_pool(pool_id
,
7001 osdmap
.get_pg_pool(pool_id
));
7003 pool
->unset_flag(flags
);
7006 string
OSDMonitor::make_purged_snap_epoch_key(epoch_t epoch
)
7009 snprintf(k
, sizeof(k
), "purged_epoch_%08lx", (unsigned long)epoch
);
7013 string
OSDMonitor::make_purged_snap_key(int64_t pool
, snapid_t snap
)
7016 snprintf(k
, sizeof(k
), "purged_snap_%llu_%016llx",
7017 (unsigned long long)pool
, (unsigned long long)snap
);
7021 string
OSDMonitor::make_purged_snap_key_value(
7022 int64_t pool
, snapid_t snap
, snapid_t num
,
7023 epoch_t epoch
, bufferlist
*v
)
7025 // encode the *last* epoch in the key so that we can use forward
7026 // iteration only to search for an epoch in an interval.
7028 encode(snap
+ num
, *v
);
7030 return make_purged_snap_key(pool
, snap
+ num
- 1);
7034 int OSDMonitor::lookup_purged_snap(
7035 int64_t pool
, snapid_t snap
,
7036 snapid_t
*begin
, snapid_t
*end
)
7038 string k
= make_purged_snap_key(pool
, snap
);
7039 auto it
= mon
.store
->get_iterator(OSD_SNAP_PREFIX
);
7042 dout(20) << __func__
7043 << " pool " << pool
<< " snap " << snap
7044 << " - key '" << k
<< "' not found" << dendl
;
7047 if (it
->key().find("purged_snap_") != 0) {
7048 dout(20) << __func__
7049 << " pool " << pool
<< " snap " << snap
7050 << " - key '" << k
<< "' got '" << it
->key()
7051 << "', wrong prefix" << dendl
;
7054 string gotk
= it
->key();
7055 const char *format
= "purged_snap_%llu_";
7056 long long int keypool
;
7057 int n
= sscanf(gotk
.c_str(), format
, &keypool
);
7059 derr
<< __func__
<< " invalid k '" << gotk
<< "'" << dendl
;
7062 if (pool
!= keypool
) {
7063 dout(20) << __func__
7064 << " pool " << pool
<< " snap " << snap
7065 << " - key '" << k
<< "' got '" << gotk
7066 << "', wrong pool " << keypool
7070 bufferlist v
= it
->value();
7071 auto p
= v
.cbegin();
7074 if (snap
< *begin
|| snap
>= *end
) {
7075 dout(20) << __func__
7076 << " pool " << pool
<< " snap " << snap
7077 << " - found [" << *begin
<< "," << *end
<< "), no overlap"
7084 void OSDMonitor::insert_purged_snap_update(
7086 snapid_t start
, snapid_t end
,
7088 MonitorDBStore::TransactionRef t
)
7090 snapid_t before_begin
, before_end
;
7091 snapid_t after_begin
, after_end
;
7092 int b
= lookup_purged_snap(pool
, start
- 1,
7093 &before_begin
, &before_end
);
7094 int a
= lookup_purged_snap(pool
, end
,
7095 &after_begin
, &after_end
);
7097 dout(10) << __func__
7098 << " [" << start
<< "," << end
<< ") - joins ["
7099 << before_begin
<< "," << before_end
<< ") and ["
7100 << after_begin
<< "," << after_end
<< ")" << dendl
;
7101 // erase only the begin record; we'll overwrite the end one.
7102 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7104 string k
= make_purged_snap_key_value(pool
,
7105 before_begin
, after_end
- before_begin
,
7106 pending_inc
.epoch
, &v
);
7107 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7109 dout(10) << __func__
7110 << " [" << start
<< "," << end
<< ") - join with earlier ["
7111 << before_begin
<< "," << before_end
<< ")" << dendl
;
7112 t
->erase(OSD_SNAP_PREFIX
, make_purged_snap_key(pool
, before_end
- 1));
7114 string k
= make_purged_snap_key_value(pool
,
7115 before_begin
, end
- before_begin
,
7116 pending_inc
.epoch
, &v
);
7117 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7119 dout(10) << __func__
7120 << " [" << start
<< "," << end
<< ") - join with later ["
7121 << after_begin
<< "," << after_end
<< ")" << dendl
;
7122 // overwrite after record
7124 string k
= make_purged_snap_key_value(pool
,
7125 start
, after_end
- start
,
7126 pending_inc
.epoch
, &v
);
7127 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7129 dout(10) << __func__
7130 << " [" << start
<< "," << end
<< ") - new"
7133 string k
= make_purged_snap_key_value(pool
,
7135 pending_inc
.epoch
, &v
);
7136 t
->put(OSD_SNAP_PREFIX
, k
, v
);
7140 bool OSDMonitor::try_prune_purged_snaps()
7142 if (!mon
.mgrstatmon()->is_readable()) {
7145 if (!pending_inc
.new_purged_snaps
.empty()) {
7146 return false; // we already pruned for this epoch
7149 unsigned max_prune
= cct
->_conf
.get_val
<uint64_t>(
7150 "mon_max_snap_prune_per_epoch");
7154 dout(10) << __func__
<< " max_prune " << max_prune
<< dendl
;
7156 unsigned actually_pruned
= 0;
7157 auto& purged_snaps
= mon
.mgrstatmon()->get_digest().purged_snaps
;
7158 for (auto& p
: osdmap
.get_pools()) {
7159 auto q
= purged_snaps
.find(p
.first
);
7160 if (q
== purged_snaps
.end()) {
7163 auto& purged
= q
->second
;
7164 if (purged
.empty()) {
7165 dout(20) << __func__
<< " " << p
.first
<< " nothing purged" << dendl
;
7168 dout(20) << __func__
<< " pool " << p
.first
<< " purged " << purged
<< dendl
;
7169 snap_interval_set_t to_prune
;
7170 unsigned maybe_pruned
= actually_pruned
;
7171 for (auto i
= purged
.begin(); i
!= purged
.end(); ++i
) {
7172 snapid_t begin
= i
.get_start();
7173 auto end
= i
.get_start() + i
.get_len();
7174 snapid_t pbegin
= 0, pend
= 0;
7175 int r
= lookup_purged_snap(p
.first
, begin
, &pbegin
, &pend
);
7178 // be a bit aggressive about backing off here, because the mon may
7179 // do a lot of work going through this set, and if we know the
7180 // purged set from the OSDs is at least *partly* stale we may as
7181 // well wait for it to be fresh.
7182 dout(20) << __func__
<< " we've already purged " << pbegin
7183 << "~" << (pend
- pbegin
) << dendl
;
7186 if (pbegin
&& pbegin
> begin
&& pbegin
< end
) {
7187 // the tail of [begin,end) is purged; shorten the range
7190 to_prune
.insert(begin
, end
- begin
);
7191 maybe_pruned
+= end
- begin
;
7192 if (maybe_pruned
>= max_prune
) {
7196 if (!to_prune
.empty()) {
7197 // PGs may still be reporting things as purged that we have already
7198 // pruned from removed_snaps_queue.
7199 snap_interval_set_t actual
;
7200 auto r
= osdmap
.removed_snaps_queue
.find(p
.first
);
7201 if (r
!= osdmap
.removed_snaps_queue
.end()) {
7202 actual
.intersection_of(to_prune
, r
->second
);
7204 actually_pruned
+= actual
.size();
7205 dout(10) << __func__
<< " pool " << p
.first
<< " reports pruned " << to_prune
7206 << ", actual pruned " << actual
<< dendl
;
7207 if (!actual
.empty()) {
7208 pending_inc
.new_purged_snaps
[p
.first
].swap(actual
);
7211 if (actually_pruned
>= max_prune
) {
7215 dout(10) << __func__
<< " actually pruned " << actually_pruned
<< dendl
;
7216 return !!actually_pruned
;
7219 bool OSDMonitor::update_pools_status()
7221 if (!mon
.mgrstatmon()->is_readable())
7226 auto& pools
= osdmap
.get_pools();
7227 for (auto it
= pools
.begin(); it
!= pools
.end(); ++it
) {
7228 const pool_stat_t
*pstat
= mon
.mgrstatmon()->get_pool_stat(it
->first
);
7231 const object_stat_sum_t
& sum
= pstat
->stats
.sum
;
7232 const pg_pool_t
&pool
= it
->second
;
7233 const string
& pool_name
= osdmap
.get_pool_name(it
->first
);
7236 (pool
.quota_max_bytes
> 0 && (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) ||
7237 (pool
.quota_max_objects
> 0 && (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
);
7239 if (pool
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
)) {
7243 mon
.clog
->info() << "pool '" << pool_name
7244 << "' no longer out of quota; removing NO_QUOTA flag";
7245 // below we cancel FLAG_FULL too, we'll set it again in
7246 // OSDMonitor::encode_pending if it still fails the osd-full checking.
7247 clear_pool_flags(it
->first
,
7248 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7254 if (pool
.quota_max_bytes
> 0 &&
7255 (uint64_t)sum
.num_bytes
>= pool
.quota_max_bytes
) {
7256 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7257 << " (reached quota's max_bytes: "
7258 << byte_u_t(pool
.quota_max_bytes
) << ")";
7260 if (pool
.quota_max_objects
> 0 &&
7261 (uint64_t)sum
.num_objects
>= pool
.quota_max_objects
) {
7262 mon
.clog
->warn() << "pool '" << pool_name
<< "' is full"
7263 << " (reached quota's max_objects: "
7264 << pool
.quota_max_objects
<< ")";
7266 // set both FLAG_FULL_QUOTA and FLAG_FULL
7267 // note that below we try to cancel FLAG_BACKFILLFULL/NEARFULL too
7268 // since FLAG_FULL should always take precedence
7269 set_pool_flags(it
->first
,
7270 pg_pool_t::FLAG_FULL_QUOTA
| pg_pool_t::FLAG_FULL
);
7271 clear_pool_flags(it
->first
,
7272 pg_pool_t::FLAG_NEARFULL
|
7273 pg_pool_t::FLAG_BACKFILLFULL
);
7280 int OSDMonitor::prepare_new_pool(MonOpRequestRef op
)
7282 op
->mark_osdmon_event(__func__
);
7283 auto m
= op
->get_req
<MPoolOp
>();
7284 dout(10) << "prepare_new_pool from " << m
->get_connection() << dendl
;
7285 MonSession
*session
= op
->get_session();
7288 string erasure_code_profile
;
7292 ret
= prepare_new_pool(m
->name
, m
->crush_rule
, rule_name
,
7294 erasure_code_profile
,
7295 pg_pool_t::TYPE_REPLICATED
, 0, FAST_READ_OFF
, {},
7299 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
7304 int OSDMonitor::crush_rename_bucket(const string
& srcname
,
7305 const string
& dstname
,
7310 // Avoid creating a pending crush if it does not already exists and
7311 // the rename would fail.
7313 if (!_have_pending_crush()) {
7314 ret
= _get_stable_crush().can_rename_bucket(srcname
,
7321 CrushWrapper newcrush
;
7322 _get_pending_crush(newcrush
);
7324 ret
= newcrush
.rename_bucket(srcname
,
7330 pending_inc
.crush
.clear();
7331 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7332 *ss
<< "renamed bucket " << srcname
<< " into " << dstname
;
7336 void OSDMonitor::check_legacy_ec_plugin(const string
& plugin
, const string
& profile
) const
7338 string replacement
= "";
7340 if (plugin
== "jerasure_generic" ||
7341 plugin
== "jerasure_sse3" ||
7342 plugin
== "jerasure_sse4" ||
7343 plugin
== "jerasure_neon") {
7344 replacement
= "jerasure";
7345 } else if (plugin
== "shec_generic" ||
7346 plugin
== "shec_sse3" ||
7347 plugin
== "shec_sse4" ||
7348 plugin
== "shec_neon") {
7349 replacement
= "shec";
7352 if (replacement
!= "") {
7353 dout(0) << "WARNING: erasure coding profile " << profile
<< " uses plugin "
7354 << plugin
<< " that has been deprecated. Please use "
7355 << replacement
<< " instead." << dendl
;
7359 int OSDMonitor::normalize_profile(const string
& profilename
,
7360 ErasureCodeProfile
&profile
,
7364 ErasureCodeInterfaceRef erasure_code
;
7365 ErasureCodePluginRegistry
&instance
= ErasureCodePluginRegistry::instance();
7366 ErasureCodeProfile::const_iterator plugin
= profile
.find("plugin");
7367 check_legacy_ec_plugin(plugin
->second
, profilename
);
7368 int err
= instance
.factory(plugin
->second
,
7369 g_conf().get_val
<std::string
>("erasure_code_dir"),
7370 profile
, &erasure_code
, ss
);
7375 err
= erasure_code
->init(profile
, ss
);
7380 auto it
= profile
.find("stripe_unit");
7381 if (it
!= profile
.end()) {
7383 uint32_t stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7384 if (!err_str
.empty()) {
7385 *ss
<< "could not parse stripe_unit '" << it
->second
7386 << "': " << err_str
<< std::endl
;
7389 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7390 uint32_t chunk_size
= erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7391 if (chunk_size
!= stripe_unit
) {
7392 *ss
<< "stripe_unit " << stripe_unit
<< " does not match ec profile "
7393 << "alignment. Would be padded to " << chunk_size
7397 if ((stripe_unit
% 4096) != 0 && !force
) {
7398 *ss
<< "stripe_unit should be a multiple of 4096 bytes for best performance."
7399 << "use --force to override this check" << std::endl
;
7406 int OSDMonitor::crush_rule_create_erasure(const string
&name
,
7407 const string
&profile
,
7411 int ruleid
= osdmap
.crush
->get_rule_id(name
);
7412 if (ruleid
!= -ENOENT
) {
7413 *rule
= osdmap
.crush
->get_rule_mask_ruleset(ruleid
);
7417 CrushWrapper newcrush
;
7418 _get_pending_crush(newcrush
);
7420 ruleid
= newcrush
.get_rule_id(name
);
7421 if (ruleid
!= -ENOENT
) {
7422 *rule
= newcrush
.get_rule_mask_ruleset(ruleid
);
7425 ErasureCodeInterfaceRef erasure_code
;
7426 int err
= get_erasure_code(profile
, &erasure_code
, ss
);
7428 *ss
<< "failed to load plugin using profile " << profile
<< std::endl
;
7432 err
= erasure_code
->create_rule(name
, newcrush
, ss
);
7433 erasure_code
.reset();
7437 pending_inc
.crush
.clear();
7438 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
7443 int OSDMonitor::get_erasure_code(const string
&erasure_code_profile
,
7444 ErasureCodeInterfaceRef
*erasure_code
,
7447 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
))
7449 ErasureCodeProfile profile
=
7450 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7451 ErasureCodeProfile::const_iterator plugin
=
7452 profile
.find("plugin");
7453 if (plugin
== profile
.end()) {
7454 *ss
<< "cannot determine the erasure code plugin"
7455 << " because there is no 'plugin' entry in the erasure_code_profile "
7456 << profile
<< std::endl
;
7459 check_legacy_ec_plugin(plugin
->second
, erasure_code_profile
);
7460 auto& instance
= ErasureCodePluginRegistry::instance();
7461 return instance
.factory(plugin
->second
,
7462 g_conf().get_val
<std::string
>("erasure_code_dir"),
7463 profile
, erasure_code
, ss
);
7466 int OSDMonitor::check_cluster_features(uint64_t features
,
7469 stringstream unsupported_ss
;
7470 int unsupported_count
= 0;
7471 if ((mon
.get_quorum_con_features() & features
) != features
) {
7472 unsupported_ss
<< "the monitor cluster";
7473 ++unsupported_count
;
7476 set
<int32_t> up_osds
;
7477 osdmap
.get_up_osds(up_osds
);
7478 for (set
<int32_t>::iterator it
= up_osds
.begin();
7479 it
!= up_osds
.end(); ++it
) {
7480 const osd_xinfo_t
&xi
= osdmap
.get_xinfo(*it
);
7481 if ((xi
.features
& features
) != features
) {
7482 if (unsupported_count
> 0)
7483 unsupported_ss
<< ", ";
7484 unsupported_ss
<< "osd." << *it
;
7485 unsupported_count
++;
7489 if (unsupported_count
> 0) {
7490 ss
<< "features " << features
<< " unsupported by: "
7491 << unsupported_ss
.str();
7495 // check pending osd state, too!
7496 for (map
<int32_t,osd_xinfo_t
>::const_iterator p
=
7497 pending_inc
.new_xinfo
.begin();
7498 p
!= pending_inc
.new_xinfo
.end(); ++p
) {
7499 const osd_xinfo_t
&xi
= p
->second
;
7500 if ((xi
.features
& features
) != features
) {
7501 dout(10) << __func__
<< " pending osd." << p
->first
7502 << " features are insufficient; retry" << dendl
;
7510 bool OSDMonitor::validate_crush_against_features(const CrushWrapper
*newcrush
,
7513 OSDMap::Incremental new_pending
= pending_inc
;
7514 encode(*newcrush
, new_pending
.crush
, mon
.get_quorum_con_features());
7516 newmap
.deepish_copy_from(osdmap
);
7517 newmap
.apply_incremental(new_pending
);
7520 if (newmap
.require_min_compat_client
!= ceph_release_t::unknown
) {
7521 auto mv
= newmap
.get_min_compat_client();
7522 if (mv
> newmap
.require_min_compat_client
) {
7523 ss
<< "new crush map requires client version " << mv
7524 << " but require_min_compat_client is "
7525 << newmap
.require_min_compat_client
;
7532 newmap
.get_features(CEPH_ENTITY_TYPE_MON
, NULL
) |
7533 newmap
.get_features(CEPH_ENTITY_TYPE_OSD
, NULL
);
7534 stringstream features_ss
;
7535 int r
= check_cluster_features(features
, features_ss
);
7537 ss
<< "Could not change CRUSH: " << features_ss
.str();
7544 bool OSDMonitor::erasure_code_profile_in_use(
7545 const mempool::osdmap::map
<int64_t, pg_pool_t
> &pools
,
7546 const string
&profile
,
7550 for (map
<int64_t, pg_pool_t
>::const_iterator p
= pools
.begin();
7553 if (p
->second
.erasure_code_profile
== profile
&& p
->second
.is_erasure()) {
7554 *ss
<< osdmap
.pool_name
[p
->first
] << " ";
7559 *ss
<< "pool(s) are using the erasure code profile '" << profile
<< "'";
7564 int OSDMonitor::parse_erasure_code_profile(const vector
<string
> &erasure_code_profile
,
7565 map
<string
,string
> *erasure_code_profile_map
,
7568 int r
= g_conf().with_val
<string
>("osd_pool_default_erasure_code_profile",
7571 erasure_code_profile_map
,
7575 ceph_assert((*erasure_code_profile_map
).count("plugin"));
7576 string default_plugin
= (*erasure_code_profile_map
)["plugin"];
7577 map
<string
,string
> user_map
;
7578 for (vector
<string
>::const_iterator i
= erasure_code_profile
.begin();
7579 i
!= erasure_code_profile
.end();
7581 size_t equal
= i
->find('=');
7582 if (equal
== string::npos
) {
7583 user_map
[*i
] = string();
7584 (*erasure_code_profile_map
)[*i
] = string();
7586 const string key
= i
->substr(0, equal
);
7588 const string value
= i
->substr(equal
);
7589 if (key
.find("ruleset-") == 0) {
7590 *ss
<< "property '" << key
<< "' is no longer supported; try "
7591 << "'crush-" << key
.substr(8) << "' instead";
7594 user_map
[key
] = value
;
7595 (*erasure_code_profile_map
)[key
] = value
;
7599 if (user_map
.count("plugin") && user_map
["plugin"] != default_plugin
)
7600 (*erasure_code_profile_map
) = user_map
;
7605 int OSDMonitor::prepare_pool_size(const unsigned pool_type
,
7606 const string
&erasure_code_profile
,
7608 unsigned *size
, unsigned *min_size
,
7612 bool set_min_size
= false;
7613 switch (pool_type
) {
7614 case pg_pool_t::TYPE_REPLICATED
:
7615 if (osdmap
.stretch_mode_enabled
) {
7617 repl_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
7618 if (repl_size
!= g_conf().get_val
<uint64_t>("mon_stretch_pool_size")) {
7619 *ss
<< "prepare_pool_size: we are in stretch mode but size "
7620 << repl_size
<< " does not match!";
7623 *min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
7624 set_min_size
= true;
7626 if (repl_size
== 0) {
7627 repl_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
7631 *min_size
= g_conf().get_osd_pool_default_min_size(repl_size
);
7633 case pg_pool_t::TYPE_ERASURE
:
7635 if (osdmap
.stretch_mode_enabled
) {
7636 *ss
<< "prepare_pool_size: we are in stretch mode; cannot create EC pools!";
7639 ErasureCodeInterfaceRef erasure_code
;
7640 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7642 *size
= erasure_code
->get_chunk_count();
7644 erasure_code
->get_data_chunk_count() +
7645 std::min
<int>(1, erasure_code
->get_coding_chunk_count() - 1);
7646 assert(*min_size
<= *size
);
7647 assert(*min_size
>= erasure_code
->get_data_chunk_count());
7652 *ss
<< "prepare_pool_size: " << pool_type
<< " is not a known pool type";
7659 int OSDMonitor::prepare_pool_stripe_width(const unsigned pool_type
,
7660 const string
&erasure_code_profile
,
7661 uint32_t *stripe_width
,
7665 switch (pool_type
) {
7666 case pg_pool_t::TYPE_REPLICATED
:
7669 case pg_pool_t::TYPE_ERASURE
:
7671 ErasureCodeProfile profile
=
7672 osdmap
.get_erasure_code_profile(erasure_code_profile
);
7673 ErasureCodeInterfaceRef erasure_code
;
7674 err
= get_erasure_code(erasure_code_profile
, &erasure_code
, ss
);
7677 uint32_t data_chunks
= erasure_code
->get_data_chunk_count();
7678 uint32_t stripe_unit
= g_conf().get_val
<Option::size_t>("osd_pool_erasure_code_stripe_unit");
7679 auto it
= profile
.find("stripe_unit");
7680 if (it
!= profile
.end()) {
7682 stripe_unit
= strict_iecstrtoll(it
->second
.c_str(), &err_str
);
7683 ceph_assert(err_str
.empty());
7685 *stripe_width
= data_chunks
*
7686 erasure_code
->get_chunk_size(stripe_unit
* data_chunks
);
7690 *ss
<< "prepare_pool_stripe_width: "
7691 << pool_type
<< " is not a known pool type";
7698 int OSDMonitor::prepare_pool_crush_rule(const unsigned pool_type
,
7699 const string
&erasure_code_profile
,
7700 const string
&rule_name
,
7705 if (*crush_rule
< 0) {
7706 switch (pool_type
) {
7707 case pg_pool_t::TYPE_REPLICATED
:
7709 if (rule_name
== "") {
7711 *crush_rule
= osdmap
.crush
->get_osd_pool_default_crush_replicated_ruleset(cct
);
7712 if (*crush_rule
< 0) {
7713 // Errors may happen e.g. if no valid rule is available
7714 *ss
<< "No suitable CRUSH rule exists, check "
7715 << "'osd pool default crush *' config options";
7719 return get_crush_rule(rule_name
, crush_rule
, ss
);
7723 case pg_pool_t::TYPE_ERASURE
:
7725 int err
= crush_rule_create_erasure(rule_name
,
7726 erasure_code_profile
,
7730 dout(20) << "prepare_pool_crush_rule: rule "
7731 << rule_name
<< " try again" << dendl
;
7734 // need to wait for the crush rule to be proposed before proceeding
7745 *ss
<< "prepare_pool_crush_rule: " << pool_type
7746 << " is not a known pool type";
7750 if (!osdmap
.crush
->ruleset_exists(*crush_rule
)) {
7751 *ss
<< "CRUSH rule " << *crush_rule
<< " not found";
7759 int OSDMonitor::get_crush_rule(const string
&rule_name
,
7764 ret
= osdmap
.crush
->get_rule_id(rule_name
);
7765 if (ret
!= -ENOENT
) {
7769 CrushWrapper newcrush
;
7770 _get_pending_crush(newcrush
);
7772 ret
= newcrush
.get_rule_id(rule_name
);
7773 if (ret
!= -ENOENT
) {
7774 // found it, wait for it to be proposed
7775 dout(20) << __func__
<< ": rule " << rule_name
7776 << " try again" << dendl
;
7779 // Cannot find it , return error
7780 *ss
<< "specified rule " << rule_name
<< " doesn't exist";
7787 int OSDMonitor::check_pg_num(int64_t pool
, int pg_num
, int size
, ostream
*ss
)
7789 auto max_pgs_per_osd
= g_conf().get_val
<uint64_t>("mon_max_pg_per_osd");
7790 auto num_osds
= std::max(osdmap
.get_num_in_osds(), 3u); // assume min cluster size 3
7791 auto max_pgs
= max_pgs_per_osd
* num_osds
;
7792 uint64_t projected
= 0;
7794 projected
+= pg_num
* size
;
7796 for (const auto& i
: osdmap
.get_pools()) {
7797 if (i
.first
== pool
) {
7798 projected
+= pg_num
* size
;
7800 projected
+= i
.second
.get_pg_num_target() * i
.second
.get_size();
7803 if (projected
> max_pgs
) {
7805 *ss
<< "pool id " << pool
;
7807 *ss
<< " pg_num " << pg_num
<< " size " << size
7808 << " would mean " << projected
7809 << " total pgs, which exceeds max " << max_pgs
7810 << " (mon_max_pg_per_osd " << max_pgs_per_osd
7811 << " * num_in_osds " << num_osds
<< ")";
 * @param name The name of the new pool
 * @param crush_rule The crush rule to use. If <0, will use the system default
 * @param crush_rule_name The crush rule to use, if crush_ruleset <0
 * @param pg_num The pg_num to use. If set to 0, will use the system default
 * @param pgp_num The pgp_num to use. If set to 0, will use the system default
 * @param repl_size Replication factor, or 0 for default
 * @param erasure_code_profile The profile name in OSDMap to be used for erasure code
 * @param pool_type TYPE_ERASURE, or TYPE_REPLICATED
 * @param expected_num_objects expected number of objects on the pool
 * @param fast_read fast read type.
 * @param ss human readable error message, if any.
 *
 * @return 0 on success, negative errno on failure.
7832 int OSDMonitor::prepare_new_pool(string
& name
,
7834 const string
&crush_rule_name
,
7835 unsigned pg_num
, unsigned pgp_num
,
7836 unsigned pg_num_min
,
7837 const uint64_t repl_size
,
7838 const uint64_t target_size_bytes
,
7839 const float target_size_ratio
,
7840 const string
&erasure_code_profile
,
7841 const unsigned pool_type
,
7842 const uint64_t expected_num_objects
,
7843 FastReadType fast_read
,
7844 const string
& pg_autoscale_mode
,
7847 if (name
.length() == 0)
7850 pg_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pg_num");
7852 pgp_num
= g_conf().get_val
<uint64_t>("osd_pool_default_pgp_num");
7855 if (pg_num
> g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
7856 *ss
<< "'pg_num' must be greater than 0 and less than or equal to "
7857 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
7858 << " (you may adjust 'mon max pool pg num' for higher values)";
7861 if (pgp_num
> pg_num
) {
7862 *ss
<< "'pgp_num' must be greater than 0 and lower or equal than 'pg_num'"
7863 << ", which in this case is " << pg_num
;
7866 if (pool_type
== pg_pool_t::TYPE_REPLICATED
&& fast_read
== FAST_READ_ON
) {
7867 *ss
<< "'fast_read' can only apply to erasure coding pool";
7871 r
= prepare_pool_crush_rule(pool_type
, erasure_code_profile
,
7872 crush_rule_name
, &crush_rule
, ss
);
7874 dout(10) << "prepare_pool_crush_rule returns " << r
<< dendl
;
7877 if (g_conf()->mon_osd_crush_smoke_test
) {
7878 CrushWrapper newcrush
;
7879 _get_pending_crush(newcrush
);
7881 CrushTester
tester(newcrush
, err
);
7882 tester
.set_min_x(0);
7883 tester
.set_max_x(50);
7884 tester
.set_rule(crush_rule
);
7885 auto start
= ceph::coarse_mono_clock::now();
7886 r
= tester
.test_with_fork(g_conf()->mon_lease
);
7887 auto duration
= ceph::coarse_mono_clock::now() - start
;
7889 dout(10) << "tester.test_with_fork returns " << r
7890 << ": " << err
.str() << dendl
;
7891 *ss
<< "crush test failed with " << r
<< ": " << err
.str();
7894 dout(10) << __func__
<< " crush smoke test duration: "
7895 << duration
<< dendl
;
7897 unsigned size
, min_size
;
7898 r
= prepare_pool_size(pool_type
, erasure_code_profile
, repl_size
,
7899 &size
, &min_size
, ss
);
7901 dout(10) << "prepare_pool_size returns " << r
<< dendl
;
7904 r
= check_pg_num(-1, pg_num
, size
, ss
);
7906 dout(10) << "check_pg_num returns " << r
<< dendl
;
7910 if (!osdmap
.crush
->check_crush_rule(crush_rule
, pool_type
, size
, *ss
)) {
7914 uint32_t stripe_width
= 0;
7915 r
= prepare_pool_stripe_width(pool_type
, erasure_code_profile
, &stripe_width
, ss
);
7917 dout(10) << "prepare_pool_stripe_width returns " << r
<< dendl
;
7922 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
7923 switch (fast_read
) {
7930 case FAST_READ_DEFAULT
:
7931 fread
= g_conf()->osd_pool_default_ec_fast_read
;
7934 *ss
<< "invalid fast_read setting: " << fast_read
;
7939 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
7940 p
!= pending_inc
.new_pool_names
.end();
7942 if (p
->second
== name
)
7946 if (-1 == pending_inc
.new_pool_max
)
7947 pending_inc
.new_pool_max
= osdmap
.pool_max
;
7948 int64_t pool
= ++pending_inc
.new_pool_max
;
7950 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool
, &empty
);
7951 pi
->create_time
= ceph_clock_now();
7952 pi
->type
= pool_type
;
7953 pi
->fast_read
= fread
;
7954 pi
->flags
= g_conf()->osd_pool_default_flags
;
7955 if (g_conf()->osd_pool_default_flag_hashpspool
)
7956 pi
->set_flag(pg_pool_t::FLAG_HASHPSPOOL
);
7957 if (g_conf()->osd_pool_default_flag_nodelete
)
7958 pi
->set_flag(pg_pool_t::FLAG_NODELETE
);
7959 if (g_conf()->osd_pool_default_flag_nopgchange
)
7960 pi
->set_flag(pg_pool_t::FLAG_NOPGCHANGE
);
7961 if (g_conf()->osd_pool_default_flag_nosizechange
)
7962 pi
->set_flag(pg_pool_t::FLAG_NOSIZECHANGE
);
7963 pi
->set_flag(pg_pool_t::FLAG_CREATING
);
7964 if (g_conf()->osd_pool_use_gmt_hitset
)
7965 pi
->use_gmt_hitset
= true;
7967 pi
->use_gmt_hitset
= false;
7970 pi
->min_size
= min_size
;
7971 pi
->crush_rule
= crush_rule
;
7972 pi
->expected_num_objects
= expected_num_objects
;
7973 pi
->object_hash
= CEPH_STR_HASH_RJENKINS
;
7974 if (osdmap
.stretch_mode_enabled
) {
7975 pi
->peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
7976 pi
->peering_crush_bucket_target
= osdmap
.stretch_bucket_count
;
7977 pi
->peering_crush_bucket_barrier
= osdmap
.stretch_mode_bucket
;
7978 pi
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
7979 if (osdmap
.degraded_stretch_mode
) {
7980 pi
->peering_crush_bucket_count
= osdmap
.degraded_stretch_mode
;
7981 pi
->peering_crush_bucket_target
= osdmap
.degraded_stretch_mode
;
7982 // pi->peering_crush_bucket_mandatory_member = CRUSH_ITEM_NONE;
7983 // TODO: drat, we don't record this ^ anywhere, though given that it
7984 // necessarily won't exist elsewhere it likely doesn't matter
7985 pi
->min_size
= pi
->min_size
/ 2;
7986 pi
->size
= pi
->size
/ 2; // only support 2 zones now
7990 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
7991 g_conf().get_val
<string
>("osd_pool_default_pg_autoscale_mode"));
7992 m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
7993 pi
->pg_autoscale_mode
= m
;
7995 pi
->pg_autoscale_mode
= pg_pool_t::pg_autoscale_mode_t::OFF
;
7997 auto max
= g_conf().get_val
<int64_t>("mon_osd_max_initial_pgs");
7999 max
> 0 ? std::min
<uint64_t>(pg_num
, std::max
<int64_t>(1, max
))
8001 pi
->set_pg_num_pending(pi
->get_pg_num());
8002 pi
->set_pg_num_target(pg_num
);
8003 pi
->set_pgp_num(pi
->get_pg_num());
8004 pi
->set_pgp_num_target(pgp_num
);
8005 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8007 pi
->opts
.set(pool_opts_t::PG_NUM_MIN
, static_cast<int64_t>(pg_num_min
));
8009 if (auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(
8010 pg_autoscale_mode
); m
!= pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8011 pi
->pg_autoscale_mode
= m
;
8014 pi
->last_change
= pending_inc
.epoch
;
8017 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
8018 pi
->erasure_code_profile
= erasure_code_profile
;
8020 pi
->erasure_code_profile
= "";
8022 pi
->stripe_width
= stripe_width
;
8024 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
&&
8025 target_size_bytes
) {
8026 // only store for nautilus+ because TARGET_SIZE_BYTES may be
8027 // larger than int32_t max.
8028 pi
->opts
.set(pool_opts_t::TARGET_SIZE_BYTES
, static_cast<int64_t>(target_size_bytes
));
8030 if (target_size_ratio
> 0.0 &&
8031 osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
8032 // only store for nautilus+, just to be consistent and tidy.
8033 pi
->opts
.set(pool_opts_t::TARGET_SIZE_RATIO
, target_size_ratio
);
8036 pi
->cache_target_dirty_ratio_micro
=
8037 g_conf()->osd_pool_default_cache_target_dirty_ratio
* 1000000;
8038 pi
->cache_target_dirty_high_ratio_micro
=
8039 g_conf()->osd_pool_default_cache_target_dirty_high_ratio
* 1000000;
8040 pi
->cache_target_full_ratio_micro
=
8041 g_conf()->osd_pool_default_cache_target_full_ratio
* 1000000;
8042 pi
->cache_min_flush_age
= g_conf()->osd_pool_default_cache_min_flush_age
;
8043 pi
->cache_min_evict_age
= g_conf()->osd_pool_default_cache_min_evict_age
;
8045 pending_inc
.new_pool_names
[pool
] = name
;
// Set a cluster-wide OSDMap flag (e.g. noout/noup) in the pending
// incremental map and arrange for the command reply to be sent once the
// change is committed.
// NOTE(review): the stream `ss` used below is declared on a line dropped
// from this fragment (presumably a local ostringstream) — confirm upstream.
8049 bool OSDMonitor::prepare_set_flag(MonOpRequestRef op
, int flag
)
// Record this op in the osdmon event trace for debugging.
8051 op
->mark_osdmon_event(__func__
);
// new_flags < 0 is the "not yet modified in this epoch" sentinel; seed it
// from the currently committed flag set before OR-ing in the new flag.
8053 if (pending_inc
.new_flags
< 0)
8054 pending_inc
.new_flags
= osdmap
.get_flags();
8055 pending_inc
.new_flags
|= flag
;
// Human-readable confirmation returned to the command issuer.
8056 ss
<< OSDMap::get_flag_string(flag
) << " is set";
// Defer the reply until the pending map is proposed and committed.
8057 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8058 get_last_committed() + 1));
// Clear a cluster-wide OSDMap flag in the pending incremental map; mirror
// image of prepare_set_flag() above (AND-NOT instead of OR).
// NOTE(review): `ss` is declared on a line dropped from this fragment.
8062 bool OSDMonitor::prepare_unset_flag(MonOpRequestRef op
, int flag
)
// Record this op in the osdmon event trace for debugging.
8064 op
->mark_osdmon_event(__func__
);
// Lazily seed new_flags from the committed map (same sentinel convention
// as prepare_set_flag) before clearing the requested bit.
8066 if (pending_inc
.new_flags
< 0)
8067 pending_inc
.new_flags
= osdmap
.get_flags();
8068 pending_inc
.new_flags
&= ~flag
;
8069 ss
<< OSDMap::get_flag_string(flag
) << " is unset";
// Reply only after the pending map commits.
8070 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
8071 get_last_committed() + 1));
8075 int OSDMonitor::prepare_command_pool_set(const cmdmap_t
& cmdmap
,
8079 cmd_getval(cmdmap
, "pool", poolstr
);
8080 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
8082 ss
<< "unrecognized pool '" << poolstr
<< "'";
8086 cmd_getval(cmdmap
, "var", var
);
8088 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8089 if (pending_inc
.new_pools
.count(pool
))
8090 p
= pending_inc
.new_pools
[pool
];
8092 // accept val as a json string in the normal case (current
8093 // generation monitor). parse out int or float values from the
8094 // string as needed. however, if it is not a string, try to pull
8095 // out an int, in case an older monitor with an older json schema is
8096 // forwarding a request.
8098 string interr
, floaterr
;
8101 int64_t uf
= 0; // micro-f
8102 cmd_getval(cmdmap
, "val", val
);
8105 "target_max_objects"
8107 auto iec_options
= {
8109 "target_size_bytes",
8110 "compression_max_blob_size",
8111 "compression_min_blob_size",
8115 if (count(begin(si_options
), end(si_options
), var
)) {
8116 n
= strict_si_cast
<int64_t>(val
.c_str(), &interr
);
8117 } else if (count(begin(iec_options
), end(iec_options
), var
)) {
8118 n
= strict_iec_cast
<int64_t>(val
.c_str(), &interr
);
8120 // parse string as both int and float; different fields use different types.
8121 n
= strict_strtoll(val
.c_str(), 10, &interr
);
8122 f
= strict_strtod(val
.c_str(), &floaterr
);
8123 uf
= llrintl(f
* (double)1000000.0);
8127 (var
== "hit_set_type" || var
== "hit_set_period" ||
8128 var
== "hit_set_count" || var
== "hit_set_fpp" ||
8129 var
== "target_max_objects" || var
== "target_max_bytes" ||
8130 var
== "cache_target_full_ratio" || var
== "cache_target_dirty_ratio" ||
8131 var
== "cache_target_dirty_high_ratio" || var
== "use_gmt_hitset" ||
8132 var
== "cache_min_flush_age" || var
== "cache_min_evict_age" ||
8133 var
== "hit_set_grade_decay_rate" || var
== "hit_set_search_last_n" ||
8134 var
== "min_read_recency_for_promote" || var
== "min_write_recency_for_promote")) {
8138 if (var
== "size") {
8139 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8140 ss
<< "pool size change is disabled; you must unset nosizechange flag for the pool first";
8143 if (p
.type
== pg_pool_t::TYPE_ERASURE
) {
8144 ss
<< "can not change the size of an erasure-coded pool";
8147 if (interr
.length()) {
8148 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8151 if (n
<= 0 || n
> 10) {
8152 ss
<< "pool size must be between 1 and 10";
8156 if (!g_conf().get_val
<bool>("mon_allow_pool_size_one")) {
8157 ss
<< "configuring pool size as 1 is disabled by default.";
8161 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
8162 if (!sure
) { ss
<< "WARNING: setting pool size 1 could lead to data loss "
8163 "without recovery. If you are *ABSOLUTELY CERTAIN* that is what you want, "
8164 "pass the flag --yes-i-really-mean-it.";
8168 if (!osdmap
.crush
->check_crush_rule(p
.get_crush_rule(), p
.type
, n
, ss
)) {
8171 int r
= check_pg_num(pool
, p
.get_pg_num(), n
, &ss
);
8176 p
.min_size
= g_conf().get_osd_pool_default_min_size(p
.size
);
8177 } else if (var
== "min_size") {
8178 if (p
.has_flag(pg_pool_t::FLAG_NOSIZECHANGE
)) {
8179 ss
<< "pool min size change is disabled; you must unset nosizechange flag for the pool first";
8182 if (interr
.length()) {
8183 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8187 if (p
.type
!= pg_pool_t::TYPE_ERASURE
) {
8188 if (n
< 1 || n
> p
.size
) {
8189 ss
<< "pool min_size must be between 1 and size, which is set to " << (int)p
.size
;
8193 ErasureCodeInterfaceRef erasure_code
;
8196 int err
= get_erasure_code(p
.erasure_code_profile
, &erasure_code
, &tmp
);
8198 k
= erasure_code
->get_data_chunk_count();
8200 ss
<< __func__
<< " get_erasure_code failed: " << tmp
.str();
8204 if (n
< k
|| n
> p
.size
) {
8205 ss
<< "pool min_size must be between " << k
<< " and size, which is set to " << (int)p
.size
;
8210 } else if (var
== "pg_num_actual") {
8211 if (interr
.length()) {
8212 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8215 if (n
== (int)p
.get_pg_num()) {
8218 if (static_cast<uint64_t>(n
) > g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8219 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8220 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8221 << " (you may adjust 'mon max pool pg num' for higher values)";
8224 if (p
.has_flag(pg_pool_t::FLAG_CREATING
)) {
8225 ss
<< "cannot adjust pg_num while initial PGs are being created";
8228 if (n
> (int)p
.get_pg_num()) {
8229 if (p
.get_pg_num() != p
.get_pg_num_pending()) {
8230 // force pre-nautilus clients to resend their ops, since they
8231 // don't understand pg_num_pending changes form a new interval
8232 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8236 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8237 ss
<< "nautilus OSDs are required to adjust pg_num_pending";
8240 if (n
< (int)p
.get_pgp_num()) {
8241 ss
<< "specified pg_num " << n
<< " < pgp_num " << p
.get_pgp_num();
8244 if (n
< (int)p
.get_pg_num() - 1) {
8245 ss
<< "specified pg_num " << n
<< " < pg_num (" << p
.get_pg_num()
8246 << ") - 1; only single pg decrease is currently supported";
8249 p
.set_pg_num_pending(n
);
8250 // force pre-nautilus clients to resend their ops, since they
8251 // don't understand pg_num_pending changes form a new interval
8252 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8254 // force pre-luminous clients to resend their ops, since they
8255 // don't understand that split PGs now form a new interval.
8256 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8257 } else if (var
== "pg_num") {
8258 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8259 ss
<< "pool pg_num change is disabled; you must unset nopgchange flag for the pool first";
8262 if (interr
.length()) {
8263 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8266 if (n
== (int)p
.get_pg_num_target()) {
8269 if (n
<= 0 || static_cast<uint64_t>(n
) >
8270 g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")) {
8271 ss
<< "'pg_num' must be greater than 0 and less than or equal to "
8272 << g_conf().get_val
<uint64_t>("mon_max_pool_pg_num")
8273 << " (you may adjust 'mon max pool pg num' for higher values)";
8276 if (n
> (int)p
.get_pg_num_target()) {
8277 int r
= check_pg_num(pool
, n
, p
.get_size(), &ss
);
8282 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8283 if (p
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
&& !force
) {
8284 ss
<< "splits in cache pools must be followed by scrubs and leave sufficient free space to avoid overfilling. use --yes-i-really-mean-it to force.";
8288 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8289 ss
<< "nautilus OSDs are required to decrease pg_num";
8293 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8294 // pre-nautilus osdmap format; increase pg_num directly
8295 assert(n
> (int)p
.get_pg_num());
8296 // force pre-nautilus clients to resend their ops, since they
8297 // don't understand pg_num_target changes form a new interval
8298 p
.last_force_op_resend_prenautilus
= pending_inc
.epoch
;
8299 // force pre-luminous clients to resend their ops, since they
8300 // don't understand that split PGs now form a new interval.
8301 p
.last_force_op_resend_preluminous
= pending_inc
.epoch
;
8304 // set targets; mgr will adjust pg_num_actual and pgp_num later.
8305 // make pgp_num track pg_num if it already matches. if it is set
8306 // differently, leave it different and let the user control it
8308 if (p
.get_pg_num_target() == p
.get_pgp_num_target()) {
8309 p
.set_pgp_num_target(n
);
8311 p
.set_pg_num_target(n
);
8313 } else if (var
== "pgp_num_actual") {
8314 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8315 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8318 if (interr
.length()) {
8319 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8323 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8326 if (n
> (int)p
.get_pg_num()) {
8327 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num();
8330 if (n
> (int)p
.get_pg_num_pending()) {
8331 ss
<< "specified pgp_num " << n
8332 << " > pg_num_pending " << p
.get_pg_num_pending();
8336 } else if (var
== "pgp_num") {
8337 if (p
.has_flag(pg_pool_t::FLAG_NOPGCHANGE
)) {
8338 ss
<< "pool pgp_num change is disabled; you must unset nopgchange flag for the pool first";
8341 if (interr
.length()) {
8342 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8346 ss
<< "specified pgp_num must > 0, but you set to " << n
;
8349 if (n
> (int)p
.get_pg_num_target()) {
8350 ss
<< "specified pgp_num " << n
<< " > pg_num " << p
.get_pg_num_target();
8353 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8354 // pre-nautilus osdmap format; increase pgp_num directly
8357 p
.set_pgp_num_target(n
);
8359 } else if (var
== "pg_autoscale_mode") {
8360 auto m
= pg_pool_t::get_pg_autoscale_mode_by_name(val
);
8361 if (m
== pg_pool_t::pg_autoscale_mode_t::UNKNOWN
) {
8362 ss
<< "specified invalid mode " << val
;
8365 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8366 ss
<< "must set require_osd_release to nautilus or later before setting pg_autoscale_mode";
8369 p
.pg_autoscale_mode
= m
;
8370 } else if (var
== "crush_rule") {
8371 int id
= osdmap
.crush
->get_rule_id(val
);
8372 if (id
== -ENOENT
) {
8373 ss
<< "crush rule " << val
<< " does not exist";
8377 ss
<< cpp_strerror(id
);
8380 if (!osdmap
.crush
->check_crush_rule(id
, p
.get_type(), p
.get_size(), ss
)) {
8384 } else if (var
== "nodelete" || var
== "nopgchange" ||
8385 var
== "nosizechange" || var
== "write_fadvise_dontneed" ||
8386 var
== "noscrub" || var
== "nodeep-scrub") {
8387 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8388 // make sure we only compare against 'n' if we didn't receive a string
8389 if (val
== "true" || (interr
.empty() && n
== 1)) {
8391 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8394 ss
<< "expecting value 'true', 'false', '0', or '1'";
8397 } else if (var
== "hashpspool") {
8398 uint64_t flag
= pg_pool_t::get_flag_by_name(var
);
8400 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8403 ss
<< "are you SURE? this will remap all placement groups in this pool,"
8404 " this triggers large data movement,"
8405 " pass --yes-i-really-mean-it if you really do.";
8408 // make sure we only compare against 'n' if we didn't receive a string
8409 if (val
== "true" || (interr
.empty() && n
== 1)) {
8411 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8414 ss
<< "expecting value 'true', 'false', '0', or '1'";
8417 } else if (var
== "hit_set_type") {
8419 p
.hit_set_params
= HitSet::Params();
8421 int err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
8424 if (val
== "bloom") {
8425 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
8426 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
8427 p
.hit_set_params
= HitSet::Params(bsp
);
8428 } else if (val
== "explicit_hash")
8429 p
.hit_set_params
= HitSet::Params(new ExplicitHashHitSet::Params
);
8430 else if (val
== "explicit_object")
8431 p
.hit_set_params
= HitSet::Params(new ExplicitObjectHitSet::Params
);
8433 ss
<< "unrecognized hit_set type '" << val
<< "'";
8437 } else if (var
== "hit_set_period") {
8438 if (interr
.length()) {
8439 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8442 ss
<< "hit_set_period should be non-negative";
8445 p
.hit_set_period
= n
;
8446 } else if (var
== "hit_set_count") {
8447 if (interr
.length()) {
8448 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8451 ss
<< "hit_set_count should be non-negative";
8454 p
.hit_set_count
= n
;
8455 } else if (var
== "hit_set_fpp") {
8456 if (floaterr
.length()) {
8457 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8459 } else if (f
< 0 || f
> 1.0) {
8460 ss
<< "hit_set_fpp should be in the range 0..1";
8463 if (p
.hit_set_params
.get_type() != HitSet::TYPE_BLOOM
) {
8464 ss
<< "hit set is not of type Bloom; invalid to set a false positive rate!";
8467 BloomHitSet::Params
*bloomp
= static_cast<BloomHitSet::Params
*>(p
.hit_set_params
.impl
.get());
8469 } else if (var
== "use_gmt_hitset") {
8470 if (val
== "true" || (interr
.empty() && n
== 1)) {
8471 p
.use_gmt_hitset
= true;
8473 ss
<< "expecting value 'true' or '1'";
8476 } else if (var
== "allow_ec_overwrites") {
8477 if (!p
.is_erasure()) {
8478 ss
<< "ec overwrites can only be enabled for an erasure coded pool";
8482 if (!g_conf()->mon_debug_no_require_bluestore_for_ec_overwrites
&&
8483 !is_pool_currently_all_bluestore(pool
, p
, &err
)) {
8484 ss
<< "pool must only be stored on bluestore for scrubbing to work: " << err
.str();
8487 if (val
== "true" || (interr
.empty() && n
== 1)) {
8488 p
.flags
|= pg_pool_t::FLAG_EC_OVERWRITES
;
8489 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8490 ss
<< "ec overwrites cannot be disabled once enabled";
8493 ss
<< "expecting value 'true', 'false', '0', or '1'";
8496 } else if (var
== "target_max_objects") {
8497 if (interr
.length()) {
8498 ss
<< "error parsing int '" << val
<< "': " << interr
;
8501 p
.target_max_objects
= n
;
8502 } else if (var
== "target_max_bytes") {
8503 if (interr
.length()) {
8504 ss
<< "error parsing int '" << val
<< "': " << interr
;
8507 p
.target_max_bytes
= n
;
8508 } else if (var
== "cache_target_dirty_ratio") {
8509 if (floaterr
.length()) {
8510 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8513 if (f
< 0 || f
> 1.0) {
8514 ss
<< "value must be in the range 0..1";
8517 p
.cache_target_dirty_ratio_micro
= uf
;
8518 } else if (var
== "cache_target_dirty_high_ratio") {
8519 if (floaterr
.length()) {
8520 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8523 if (f
< 0 || f
> 1.0) {
8524 ss
<< "value must be in the range 0..1";
8527 p
.cache_target_dirty_high_ratio_micro
= uf
;
8528 } else if (var
== "cache_target_full_ratio") {
8529 if (floaterr
.length()) {
8530 ss
<< "error parsing float '" << val
<< "': " << floaterr
;
8533 if (f
< 0 || f
> 1.0) {
8534 ss
<< "value must be in the range 0..1";
8537 p
.cache_target_full_ratio_micro
= uf
;
8538 } else if (var
== "cache_min_flush_age") {
8539 if (interr
.length()) {
8540 ss
<< "error parsing int '" << val
<< "': " << interr
;
8543 p
.cache_min_flush_age
= n
;
8544 } else if (var
== "cache_min_evict_age") {
8545 if (interr
.length()) {
8546 ss
<< "error parsing int '" << val
<< "': " << interr
;
8549 p
.cache_min_evict_age
= n
;
8550 } else if (var
== "min_read_recency_for_promote") {
8551 if (interr
.length()) {
8552 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8555 p
.min_read_recency_for_promote
= n
;
8556 } else if (var
== "hit_set_grade_decay_rate") {
8557 if (interr
.length()) {
8558 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8561 if (n
> 100 || n
< 0) {
8562 ss
<< "value out of range,valid range is 0 - 100";
8565 p
.hit_set_grade_decay_rate
= n
;
8566 } else if (var
== "hit_set_search_last_n") {
8567 if (interr
.length()) {
8568 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8571 if (n
> p
.hit_set_count
|| n
< 0) {
8572 ss
<< "value out of range,valid range is 0 - hit_set_count";
8575 p
.hit_set_search_last_n
= n
;
8576 } else if (var
== "min_write_recency_for_promote") {
8577 if (interr
.length()) {
8578 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8581 p
.min_write_recency_for_promote
= n
;
8582 } else if (var
== "fast_read") {
8583 if (p
.is_replicated()) {
8584 ss
<< "fast read is not supported in replication pool";
8587 if (val
== "true" || (interr
.empty() && n
== 1)) {
8589 } else if (val
== "false" || (interr
.empty() && n
== 0)) {
8590 p
.fast_read
= false;
8592 ss
<< "expecting value 'true', 'false', '0', or '1'";
8595 } else if (pool_opts_t::is_opt_name(var
)) {
8596 bool unset
= val
== "unset";
8597 if (var
== "compression_mode") {
8599 auto cmode
= Compressor::get_comp_mode_type(val
);
8601 ss
<< "unrecognized compression mode '" << val
<< "'";
8605 } else if (var
== "compression_algorithm") {
8607 auto alg
= Compressor::get_comp_alg_type(val
);
8609 ss
<< "unrecognized compression_algorithm '" << val
<< "'";
8613 } else if (var
== "compression_required_ratio") {
8614 if (floaterr
.length()) {
8615 ss
<< "error parsing float value '" << val
<< "': " << floaterr
;
8618 if (f
< 0 || f
> 1) {
8619 ss
<< "compression_required_ratio is out of range (0-1): '" << val
<< "'";
8622 } else if (var
== "csum_type") {
8623 auto t
= unset
? 0 : Checksummer::get_csum_string_type(val
);
8625 ss
<< "unrecognized csum_type '" << val
<< "'";
8628 //preserve csum_type numeric value
8631 } else if (var
== "compression_max_blob_size" ||
8632 var
== "compression_min_blob_size" ||
8633 var
== "csum_max_block" ||
8634 var
== "csum_min_block") {
8635 if (interr
.length()) {
8636 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8639 } else if (var
== "fingerprint_algorithm") {
8641 auto alg
= pg_pool_t::get_fingerprint_from_str(val
);
8643 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8647 } else if (var
== "target_size_bytes") {
8648 if (interr
.length()) {
8649 ss
<< "error parsing unit value '" << val
<< "': " << interr
;
8652 if (osdmap
.require_osd_release
< ceph_release_t::nautilus
) {
8653 ss
<< "must set require_osd_release to nautilus or "
8654 << "later before setting target_size_bytes";
8657 } else if (var
== "pg_num_min") {
8658 if (interr
.length()) {
8659 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8662 if (n
> (int)p
.get_pg_num_target()) {
8663 ss
<< "specified pg_num_min " << n
8664 << " > pg_num " << p
.get_pg_num_target();
8667 } else if (var
== "recovery_priority") {
8668 if (interr
.length()) {
8669 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8672 if (!g_conf()->debug_allow_any_pool_priority
) {
8673 if (n
> OSD_POOL_PRIORITY_MAX
|| n
< OSD_POOL_PRIORITY_MIN
) {
8674 ss
<< "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN
8675 << " and " << OSD_POOL_PRIORITY_MAX
;
8679 } else if (var
== "pg_autoscale_bias") {
8680 if (f
< 0.0 || f
> 1000.0) {
8681 ss
<< "pg_autoscale_bias must be between 0 and 1000";
8684 } else if (var
== "dedup_tier") {
8685 if (interr
.empty()) {
8686 ss
<< "expecting value 'pool name'";
8689 // Current base tier in dedup does not support ec pool
8690 if (p
.is_erasure()) {
8691 ss
<< "pool '" << poolstr
8692 << "' is an ec pool, which cannot be a base tier";
8695 int64_t lowtierpool_id
= osdmap
.lookup_pg_pool_name(val
);
8696 if (lowtierpool_id
< 0) {
8697 ss
<< "unrecognized pool '" << val
<< "'";
8700 const pg_pool_t
*tp
= osdmap
.get_pg_pool(lowtierpool_id
);
8703 // The original input is string (pool name), but we convert it to int64_t.
8706 } else if (var
== "dedup_chunk_algorithm") {
8708 auto alg
= pg_pool_t::get_dedup_chunk_algorithm_from_str(val
);
8710 ss
<< "unrecognized fingerprint_algorithm '" << val
<< "'";
8714 } else if (var
== "dedup_cdc_chunk_size") {
8715 if (interr
.length()) {
8716 ss
<< "error parsing int value '" << val
<< "': " << interr
;
8721 pool_opts_t::opt_desc_t desc
= pool_opts_t::get_opt_desc(var
);
8722 switch (desc
.type
) {
8723 case pool_opts_t::STR
:
8725 p
.opts
.unset(desc
.key
);
8727 p
.opts
.set(desc
.key
, static_cast<std::string
>(val
));
8730 case pool_opts_t::INT
:
8731 if (interr
.length()) {
8732 ss
<< "error parsing integer value '" << val
<< "': " << interr
;
8736 p
.opts
.unset(desc
.key
);
8738 p
.opts
.set(desc
.key
, static_cast<int64_t>(n
));
8741 case pool_opts_t::DOUBLE
:
8742 if (floaterr
.length()) {
8743 ss
<< "error parsing floating point value '" << val
<< "': " << floaterr
;
8747 p
.opts
.unset(desc
.key
);
8749 p
.opts
.set(desc
.key
, static_cast<double>(f
));
8753 ceph_assert(!"unknown type");
8756 ss
<< "unrecognized variable '" << var
<< "'";
8759 if (val
!= "unset") {
8760 ss
<< "set pool " << pool
<< " " << var
<< " to " << val
;
8762 ss
<< "unset pool " << pool
<< " " << var
;
8764 p
.last_change
= pending_inc
.epoch
;
8765 pending_inc
.new_pools
[pool
] = p
;
// Prepare-phase entry point for "osd pool application ..." commands.
// Per the shared-logic comment below, prepare mode actually updates the
// pending state: delegates with no modified-flag output (nullptr) and the
// final boolean set to true.
8769 int OSDMonitor::prepare_command_pool_application(const string
&prefix
,
8770 const cmdmap_t
& cmdmap
,
8773 return _command_pool_application(prefix
, cmdmap
, ss
, nullptr, true);
// Preprocess-phase entry point for "osd pool application ..." commands.
// Per the shared-logic comment below, preprocess only validates and
// reports (via `modified`) whether the command would change anything:
// delegates with the final boolean set to false.
8776 int OSDMonitor::preprocess_command_pool_application(const string
&prefix
,
8777 const cmdmap_t
& cmdmap
,
8781 return _command_pool_application(prefix
, cmdmap
, ss
, modified
, false);
8786 * Common logic for preprocess and prepare phases of pool application
8787 * tag commands. In preprocess mode we're only detecting invalid
8788 * commands, and determining whether it was a modification or a no-op.
8789 * In prepare mode we're actually updating the pending state.
// Shared implementation for "osd pool application {enable|disable|set|rm}".
// Dispatches on the command prefix suffix, validates inputs, mutates a
// working copy of the pool's application_metadata, and (in prepare mode)
// stages the result in pending_inc.
// NOTE(review): several original lines (returns, braces, declarations)
// were dropped from this fragment; hedged comments below mark the gaps.
8791 int OSDMonitor::_command_pool_application(const string
&prefix
,
8792 const cmdmap_t
& cmdmap
,
// Resolve the target pool by name; negative id means unknown pool.
8798 cmd_getval(cmdmap
, "pool", pool_name
);
8799 int64_t pool
= osdmap
.lookup_pg_pool_name(pool_name
.c_str());
8801 ss
<< "unrecognized pool '" << pool_name
<< "'";
// Work on a copy of the pool, preferring any already-pending version so
// successive commands in one epoch compose.
8805 pg_pool_t p
= *osdmap
.get_pg_pool(pool
);
8807 if (pending_inc
.new_pools
.count(pool
)) {
8808 p
= pending_inc
.new_pools
[pool
];
// Fetch the application name and whether it is already enabled.
8813 cmd_getval(cmdmap
, "app", app
);
8814 bool app_exists
= (p
.application_metadata
.count(app
) > 0);
// "all" is reserved (used as a wildcard elsewhere) and rejected for both
// key and value.
8817 cmd_getval(cmdmap
, "key", key
);
8819 ss
<< "key cannot be 'all'";
8824 cmd_getval(cmdmap
, "value", value
);
8825 if (value
== "all") {
8826 ss
<< "value cannot be 'all'";
// ---- "... application enable <app>" ----
8830 if (boost::algorithm::ends_with(prefix
, "enable")) {
8832 ss
<< "application name must be provided";
8837 ss
<< "application must be enabled on base tier";
// Enabling a second application requires explicit confirmation.
8842 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8844 if (!app_exists
&& !p
.application_metadata
.empty() && !force
) {
8845 ss
<< "Are you SURE? Pool '" << pool_name
<< "' already has an enabled "
8846 << "application; pass --yes-i-really-mean-it to proceed anyway";
8850 if (!app_exists
&& p
.application_metadata
.size() >= MAX_POOL_APPLICATIONS
) {
8851 ss
<< "too many enabled applications on pool '" << pool_name
<< "'; "
8852 << "max " << MAX_POOL_APPLICATIONS
;
8856 if (app
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8857 ss
<< "application name '" << app
<< "' too long; max length "
8858 << MAX_POOL_APPLICATION_LENGTH
;
// Enable: create an (empty) key/value map for the application.
8863 p
.application_metadata
[app
] = {};
8865 ss
<< "enabled application '" << app
<< "' on pool '" << pool_name
<< "'";
// ---- "... application disable <app>" ----
8867 } else if (boost::algorithm::ends_with(prefix
, "disable")) {
// Disabling may break clients of the pool, so require confirmation.
8869 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
8872 ss
<< "Are you SURE? Disabling an application within a pool might result "
8873 << "in loss of application functionality; pass "
8874 << "--yes-i-really-mean-it to proceed anyway";
// Not enabled -> success without change (idempotent).
8879 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8881 return 0; // idempotent
8884 p
.application_metadata
.erase(app
);
8885 ss
<< "disable application '" << app
<< "' on pool '" << pool_name
<< "'";
// ---- "... application set <app> <key> <value>" ----
8887 } else if (boost::algorithm::ends_with(prefix
, "set")) {
8889 ss
<< "application metadata must be set on base tier";
8894 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8900 cmd_getval(cmdmap
, "key", key
);
8903 ss
<< "key must be provided";
// Enforce per-application key-count and key/value length limits.
8907 auto &app_keys
= p
.application_metadata
[app
];
8908 if (app_keys
.count(key
) == 0 &&
8909 app_keys
.size() >= MAX_POOL_APPLICATION_KEYS
) {
8910 ss
<< "too many keys set for application '" << app
<< "' on pool '"
8911 << pool_name
<< "'; max " << MAX_POOL_APPLICATION_KEYS
;
8915 if (key
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8916 ss
<< "key '" << app
<< "' too long; max length "
8917 << MAX_POOL_APPLICATION_LENGTH
;
8922 cmd_getval(cmdmap
, "value", value
);
8923 if (value
.length() > MAX_POOL_APPLICATION_LENGTH
) {
8924 ss
<< "value '" << value
<< "' too long; max length "
8925 << MAX_POOL_APPLICATION_LENGTH
;
8929 p
.application_metadata
[app
][key
] = value
;
8930 ss
<< "set application '" << app
<< "' key '" << key
<< "' to '"
8931 << value
<< "' on pool '" << pool_name
<< "'";
// ---- "... application rm <app> <key>" ----
8932 } else if (boost::algorithm::ends_with(prefix
, "rm")) {
8934 ss
<< "application '" << app
<< "' is not enabled on pool '" << pool_name
8940 cmd_getval(cmdmap
, "key", key
);
8941 auto it
= p
.application_metadata
[app
].find(key
);
// Missing key -> success without change (idempotent).
8942 if (it
== p
.application_metadata
[app
].end()) {
8943 ss
<< "application '" << app
<< "' on pool '" << pool_name
8944 << "' does not have key '" << key
<< "'";
8945 return 0; // idempotent
8948 p
.application_metadata
[app
].erase(it
);
8949 ss
<< "removed application '" << app
<< "' key '" << key
<< "' on pool '"
8950 << pool_name
<< "'";
// Stamp the change epoch and stage the modified pool in the pending map.
8956 p
.last_change
= pending_inc
.epoch
;
8957 pending_inc
.new_pools
[pool
] = p
;
8960 // Because we fell through this far, we didn't hit no-op cases,
8961 // so pool was definitely modified
8962 if (modified
!= nullptr) {
// Remove (or unlink) item `id` from the given CRUSH map copy. Two paths:
// a scoped removal restricted to descendants of `ancestor`, and a plain
// removal honoring `unlink_only`. The selecting condition (presumably
// `has_ancestor`) sits on a line dropped from this fragment — confirm
// upstream.
8969 int OSDMonitor::_prepare_command_osd_crush_remove(
8970 CrushWrapper
&newcrush
,
8979 err
= newcrush
.remove_item_under(cct
, id
, ancestor
,
8982 err
= newcrush
.remove_item(cct
, id
, unlink_only
);
// Stage an already-modified CRUSH map in the pending incremental: discard
// any previously staged encoding and re-encode `newcrush` using only the
// features supported by the current monitor quorum.
8987 void OSDMonitor::do_osd_crush_remove(CrushWrapper
& newcrush
)
8989 pending_inc
.crush
.clear();
8990 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
// Remove item `id` from the CRUSH map and commit the result to the
// pending incremental. Error/early-return handling between the helper
// call and the assert was dropped from this fragment (presumably the
// non-zero / not-found cases return before reaching the assert).
8993 int OSDMonitor::prepare_command_osd_crush_remove(
8994 CrushWrapper
&newcrush
,
9000 int err
= _prepare_command_osd_crush_remove(
9001 newcrush
, id
, ancestor
,
9002 has_ancestor
, unlink_only
);
// Only a fully successful removal reaches this point.
9007 ceph_assert(err
== 0);
9008 do_osd_crush_remove(newcrush
);
// Stage removal of OSD `id` from the map. An OSD that is still up is
// handled first (the body of that branch was dropped from this fragment —
// presumably an error return; confirm upstream). Otherwise: record the
// current state bits in the pending map, reset the uuid, and schedule the
// OSD's stored metadata for deletion.
9013 int OSDMonitor::prepare_command_osd_remove(int32_t id
)
9015 if (osdmap
.is_up(id
)) {
// Copy current state so apply_incremental can toggle EXISTS off.
9019 pending_inc
.new_state
[id
] = osdmap
.get_state(id
);
// Zero uuid disassociates the id from any daemon identity.
9020 pending_inc
.new_uuid
[id
] = uuid_d();
// Remove any stored metadata for this id; also drop it from the
// not-yet-committed metadata set.
9021 pending_metadata_rm
.insert(id
);
9022 pending_metadata
.erase(id
);
// Pick an OSD id for a new OSD. Scans [0, max_osd) for a hole: an id that
// does not exist in the committed map, is not pending boot, and is not
// pending creation (EXISTS bit) — the action taken on a match (presumably
// storing it via *existing_id and returning) is on lines dropped from
// this fragment. With no hole, return the next id past the current or
// pending max.
9027 int32_t OSDMonitor::_allocate_osd_id(int32_t* existing_id
)
9029 ceph_assert(existing_id
);
9032 for (int32_t i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
9033 if (!osdmap
.exists(i
) &&
9034 pending_inc
.new_up_client
.count(i
) == 0 &&
9035 (pending_inc
.new_state
.count(i
) == 0 ||
9036 (pending_inc
.new_state
[i
] & CEPH_OSD_EXISTS
) == 0)) {
// No free slot: grow past whichever max is authoritative (new_max_osd < 0
// means the pending map has not changed it this epoch).
9042 if (pending_inc
.new_max_osd
< 0) {
9043 return osdmap
.get_max_osd();
9045 return pending_inc
.new_max_osd
;
// Create (or idempotently re-create) an OSD entry in the pending map:
// resolve the id (reuse by uuid, honor an explicit id, or allocate),
// optionally register a device class in CRUSH, then stage weight/state/
// uuid for the new id. Validation is assumed done by the caller (see the
// original comment below).
9048 void OSDMonitor::do_osd_create(
9051 const string
& device_class
,
9054 dout(10) << __func__
<< " uuid " << uuid
<< dendl
;
9055 ceph_assert(new_id
);
9057 // We presume validation has been performed prior to calling this
9058 // function. We assert with prejudice.
9060 int32_t allocated_id
= -1; // declare here so we can jump
9061 int32_t existing_id
= -1;
// Non-zero uuid: check whether this daemon identity already has an id and
// reuse it (replay / idempotent re-create).
9062 if (!uuid
.is_zero()) {
9063 existing_id
= osdmap
.identify_osd(uuid
);
9064 if (existing_id
>= 0) {
// If the caller also passed an id it must match the existing one.
9065 ceph_assert(id
< 0 || id
== existing_id
);
9066 *new_id
= existing_id
;
9068 } else if (id
>= 0) {
9069 // uuid does not exist, and id has been provided, so just create
9076 // allocate a new id
9077 allocated_id
= _allocate_osd_id(&existing_id
);
9078 dout(10) << __func__
<< " allocated id " << allocated_id
9079 << " existing id " << existing_id
<< dendl
;
// Exactly one of existing_id / allocated_id is valid here.
9080 if (existing_id
>= 0) {
9081 ceph_assert(existing_id
< osdmap
.get_max_osd());
9082 ceph_assert(allocated_id
< 0);
9083 *new_id
= existing_id
;
9084 } else if (allocated_id
>= 0) {
9085 ceph_assert(existing_id
< 0);
// Grow max_osd in the pending map and take the newly added slot.
9087 if (pending_inc
.new_max_osd
< 0) {
9088 pending_inc
.new_max_osd
= osdmap
.get_max_osd() + 1;
9090 ++pending_inc
.new_max_osd
;
9092 *new_id
= pending_inc
.new_max_osd
- 1;
9093 ceph_assert(*new_id
== allocated_id
);
9095 ceph_abort_msg("unexpected condition");
// Optionally register the OSD's device class in a pending CRUSH copy.
9099 if (device_class
.size()) {
9100 CrushWrapper newcrush
;
9101 _get_pending_crush(newcrush
);
// Make sure the CRUSH device array covers the new id.
9102 if (newcrush
.get_max_devices() < *new_id
+ 1) {
9103 newcrush
.set_max_devices(*new_id
+ 1);
9105 string name
= string("osd.") + stringify(*new_id
);
9106 if (!newcrush
.item_exists(*new_id
)) {
9107 newcrush
.set_item_name(*new_id
, name
);
9110 int r
= newcrush
.update_device_class(*new_id
, device_class
, name
, &ss
);
// Failure to set the class is logged but deliberately not fatal (see the
// replay/idempotency comment below).
9112 derr
<< __func__
<< " failed to set " << name
<< " device_class "
9113 << device_class
<< ": " << cpp_strerror(r
) << " - " << ss
.str()
9115 // non-fatal... this might be a replay and we want to be idempotent.
9117 dout(20) << __func__
<< " set " << name
<< " device_class " << device_class
// Stage the updated CRUSH map in the pending incremental.
9119 pending_inc
.crush
.clear();
9120 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9123 dout(20) << __func__
<< " no device_class" << dendl
;
9126 dout(10) << __func__
<< " using id " << *new_id
<< dendl
;
// Ensure max_osd (committed or pending) covers the chosen id.
9127 if (osdmap
.get_max_osd() <= *new_id
&& pending_inc
.new_max_osd
<= *new_id
) {
9128 pending_inc
.new_max_osd
= *new_id
+ 1;
// New OSDs start "in" with the NEW state bit set.
9131 pending_inc
.new_weight
[*new_id
] = CEPH_OSD_IN
;
9132 // do not set EXISTS; OSDMap::set_weight, called by apply_incremental, will
9133 // set it for us. (ugh.)
9134 pending_inc
.new_state
[*new_id
] |= CEPH_OSD_NEW
;
// Bind the daemon identity to the id when one was supplied.
9135 if (!uuid
.is_zero())
9136 pending_inc
.new_uuid
[*new_id
] = uuid
;
9139 int OSDMonitor::validate_osd_create(
9142 const bool check_osd_exists
,
9143 int32_t* existing_id
,
9147 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
9148 << " check_osd_exists " << check_osd_exists
<< dendl
;
9150 ceph_assert(existing_id
);
9152 if (id
< 0 && uuid
.is_zero()) {
9153 // we have nothing to validate
9156 } else if (uuid
.is_zero()) {
9157 // we have an id but we will ignore it - because that's what
9158 // `osd create` does.
9163 * This function will be used to validate whether we are able to
9164 * create a new osd when the `uuid` is specified.
9166 * It will be used by both `osd create` and `osd new`, as the checks
9167 * are basically the same when it pertains to osd id and uuid validation.
9168 * However, `osd create` presumes an `uuid` is optional, for legacy
9169 * reasons, while `osd new` requires the `uuid` to be provided. This
9170 * means that `osd create` will not be idempotent if an `uuid` is not
9171 * provided, but we will always guarantee the idempotency of `osd new`.
9174 ceph_assert(!uuid
.is_zero());
9175 if (pending_inc
.identify_osd(uuid
) >= 0) {
9176 // osd is about to exist
9180 int32_t i
= osdmap
.identify_osd(uuid
);
9182 // osd already exists
9183 if (id
>= 0 && i
!= id
) {
9184 ss
<< "uuid " << uuid
<< " already in use for different id " << i
;
9187 // return a positive errno to distinguish between a blocking error
9188 // and an error we consider to not be a problem (i.e., this would be
9189 // an idempotent operation).
9195 if (pending_inc
.new_state
.count(id
)) {
9196 // osd is about to exist
9199 // we may not care if an osd exists if we are recreating a previously
9201 if (check_osd_exists
&& osdmap
.exists(id
)) {
9202 ss
<< "id " << id
<< " already in use and does not match uuid "
9210 int OSDMonitor::prepare_command_osd_create(
9213 int32_t* existing_id
,
9216 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9217 ceph_assert(existing_id
);
9218 if (osdmap
.is_destroyed(id
)) {
9219 ss
<< "ceph osd create has been deprecated. Please use ceph osd new "
9224 if (uuid
.is_zero()) {
9225 dout(10) << __func__
<< " no uuid; assuming legacy `osd create`" << dendl
;
9228 return validate_osd_create(id
, uuid
, true, existing_id
, ss
);
9231 int OSDMonitor::prepare_command_osd_new(
9233 const cmdmap_t
& cmdmap
,
9234 const map
<string
,string
>& params
,
9242 ceph_assert(paxos
.is_plugged());
9244 dout(10) << __func__
<< " " << op
<< dendl
;
9246 /* validate command. abort now if something's wrong. */
9248 /* `osd new` will expect a `uuid` to be supplied; `id` is optional.
9250 * If `id` is not specified, we will identify any existing osd based
9251 * on `uuid`. Operation will be idempotent iff secrets match.
9253 * If `id` is specified, we will identify any existing osd based on
9254 * `uuid` and match against `id`. If they match, operation will be
9255 * idempotent iff secrets match.
9257 * `-i secrets.json` will be optional. If supplied, will be used
9258 * to check for idempotency when `id` and `uuid` match.
9260 * If `id` is not specified, and `uuid` does not exist, an id will
9261 * be found or allocated for the osd.
9263 * If `id` is specified, and the osd has been previously marked
9264 * as destroyed, then the `id` will be reused.
9266 if (!cmd_getval(cmdmap
, "uuid", uuidstr
)) {
9267 ss
<< "requires the OSD's UUID to be specified.";
9269 } else if (!uuid
.parse(uuidstr
.c_str())) {
9270 ss
<< "invalid UUID value '" << uuidstr
<< "'.";
9274 if (cmd_getval(cmdmap
, "id", id
) &&
9276 ss
<< "invalid OSD id; must be greater or equal than zero.";
9280 // are we running an `osd create`-like command, or recreating
9281 // a previously destroyed osd?
9283 bool is_recreate_destroyed
= (id
>= 0 && osdmap
.is_destroyed(id
));
9285 // we will care about `id` to assess whether osd is `destroyed`, or
9286 // to create a new osd.
9287 // we will need an `id` by the time we reach auth.
9289 int32_t existing_id
= -1;
9290 int err
= validate_osd_create(id
, uuid
, !is_recreate_destroyed
,
9293 bool may_be_idempotent
= false;
9294 if (err
== EEXIST
) {
9295 // this is idempotent from the osdmon's point-of-view
9296 may_be_idempotent
= true;
9297 ceph_assert(existing_id
>= 0);
9299 } else if (err
< 0) {
9303 if (!may_be_idempotent
) {
9304 // idempotency is out of the window. We are either creating a new
9305 // osd or recreating a destroyed osd.
9307 // We now need to figure out if we have an `id` (and if it's valid),
9308 // of find an `id` if we don't have one.
9310 // NOTE: we need to consider the case where the `id` is specified for
9311 // `osd create`, and we must honor it. So this means checking if
9312 // the `id` is destroyed, and if so assume the destroy; otherwise,
9313 // check if it `exists` - in which case we complain about not being
9314 // `destroyed`. In the end, if nothing fails, we must allow the
9315 // creation, so that we are compatible with `create`.
9316 if (id
>= 0 && osdmap
.exists(id
) && !osdmap
.is_destroyed(id
)) {
9317 dout(10) << __func__
<< " osd." << id
<< " isn't destroyed" << dendl
;
9318 ss
<< "OSD " << id
<< " has not yet been destroyed";
9320 } else if (id
< 0) {
9322 id
= _allocate_osd_id(&existing_id
);
9324 ceph_assert(existing_id
>= 0);
9327 dout(10) << __func__
<< " found id " << id
<< " to use" << dendl
;
9328 } else if (id
>= 0 && osdmap
.is_destroyed(id
)) {
9329 dout(10) << __func__
<< " recreating osd." << id
<< dendl
;
9331 dout(10) << __func__
<< " creating new osd." << id
<< dendl
;
9334 ceph_assert(id
>= 0);
9335 ceph_assert(osdmap
.exists(id
));
9338 // we are now able to either create a brand new osd or reuse an existing
9339 // osd that has been previously destroyed.
9341 dout(10) << __func__
<< " id " << id
<< " uuid " << uuid
<< dendl
;
9343 if (may_be_idempotent
&& params
.empty()) {
9344 // nothing to do, really.
9345 dout(10) << __func__
<< " idempotent and no params -- no op." << dendl
;
9346 ceph_assert(id
>= 0);
9348 f
->open_object_section("created_osd");
9349 f
->dump_int("osdid", id
);
9357 string device_class
;
9358 auto p
= params
.find("crush_device_class");
9359 if (p
!= params
.end()) {
9360 device_class
= p
->second
;
9361 dout(20) << __func__
<< " device_class will be " << device_class
<< dendl
;
9363 string cephx_secret
, lockbox_secret
, dmcrypt_key
;
9364 bool has_lockbox
= false;
9365 bool has_secrets
= params
.count("cephx_secret")
9366 || params
.count("cephx_lockbox_secret")
9367 || params
.count("dmcrypt_key");
9369 KVMonitor
*svc
= nullptr;
9370 AuthMonitor::auth_entity_t cephx_entity
, lockbox_entity
;
9373 if (params
.count("cephx_secret") == 0) {
9374 ss
<< "requires a cephx secret.";
9377 cephx_secret
= params
.at("cephx_secret");
9379 bool has_lockbox_secret
= (params
.count("cephx_lockbox_secret") > 0);
9380 bool has_dmcrypt_key
= (params
.count("dmcrypt_key") > 0);
9382 dout(10) << __func__
<< " has lockbox " << has_lockbox_secret
9383 << " dmcrypt " << has_dmcrypt_key
<< dendl
;
9385 if (has_lockbox_secret
&& has_dmcrypt_key
) {
9387 lockbox_secret
= params
.at("cephx_lockbox_secret");
9388 dmcrypt_key
= params
.at("dmcrypt_key");
9389 } else if (!has_lockbox_secret
!= !has_dmcrypt_key
) {
9390 ss
<< "requires both a cephx lockbox secret and a dm-crypt key.";
9394 dout(10) << __func__
<< " validate secrets using osd id " << id
<< dendl
;
9396 err
= mon
.authmon()->validate_osd_new(id
, uuid
,
9404 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9405 // for this to be idempotent, `id` should already be >= 0; no need
9406 // to use validate_id.
9407 ceph_assert(id
>= 0);
9408 ss
<< "osd." << id
<< " exists but secrets do not match";
9414 err
= svc
->validate_osd_new(uuid
, dmcrypt_key
, ss
);
9417 } else if (may_be_idempotent
&& err
!= EEXIST
) {
9418 ceph_assert(id
>= 0);
9419 ss
<< "osd." << id
<< " exists but dm-crypt key does not match.";
9424 ceph_assert(!has_secrets
|| !cephx_secret
.empty());
9425 ceph_assert(!has_lockbox
|| !lockbox_secret
.empty());
9427 if (may_be_idempotent
) {
9428 // we have nothing to do for either the osdmon or the authmon,
9429 // and we have no lockbox - so the config key service will not be
9430 // touched. This is therefore an idempotent operation, and we can
9431 // just return right away.
9432 dout(10) << __func__
<< " idempotent -- no op." << dendl
;
9433 ceph_assert(id
>= 0);
9435 f
->open_object_section("created_osd");
9436 f
->dump_int("osdid", id
);
9443 ceph_assert(!may_be_idempotent
);
9447 ceph_assert(!cephx_secret
.empty());
9448 ceph_assert((lockbox_secret
.empty() && dmcrypt_key
.empty()) ||
9449 (!lockbox_secret
.empty() && !dmcrypt_key
.empty()));
9451 err
= mon
.authmon()->do_osd_new(cephx_entity
,
9454 ceph_assert(0 == err
);
9457 ceph_assert(nullptr != svc
);
9458 svc
->do_osd_new(uuid
, dmcrypt_key
);
9462 if (is_recreate_destroyed
) {
9463 ceph_assert(id
>= 0);
9464 ceph_assert(osdmap
.is_destroyed(id
));
9465 pending_inc
.new_state
[id
] |= CEPH_OSD_DESTROYED
;
9466 if ((osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
9467 pending_inc
.new_state
[id
] |= CEPH_OSD_NEW
;
9469 if (osdmap
.get_state(id
) & CEPH_OSD_UP
) {
9470 // due to http://tracker.ceph.com/issues/20751 some clusters may
9471 // have UP set for non-existent OSDs; make sure it is cleared
9472 // for a newly created osd.
9473 pending_inc
.new_state
[id
] |= CEPH_OSD_UP
;
9475 pending_inc
.new_uuid
[id
] = uuid
;
9477 ceph_assert(id
>= 0);
9478 int32_t new_id
= -1;
9479 do_osd_create(id
, uuid
, device_class
, &new_id
);
9480 ceph_assert(new_id
>= 0);
9481 ceph_assert(id
== new_id
);
9485 f
->open_object_section("created_osd");
9486 f
->dump_int("osdid", id
);
9495 bool OSDMonitor::prepare_command(MonOpRequestRef op
)
9497 op
->mark_osdmon_event(__func__
);
9498 auto m
= op
->get_req
<MMonCommand
>();
9501 if (!cmdmap_from_json(m
->cmd
, &cmdmap
, ss
)) {
9502 string rs
= ss
.str();
9503 mon
.reply_command(op
, -EINVAL
, rs
, get_last_committed());
9507 MonSession
*session
= op
->get_session();
9509 derr
<< __func__
<< " no session" << dendl
;
9510 mon
.reply_command(op
, -EACCES
, "access denied", get_last_committed());
9514 return prepare_command_impl(op
, cmdmap
);
9517 static int parse_reweights(CephContext
*cct
,
9518 const cmdmap_t
& cmdmap
,
9519 const OSDMap
& osdmap
,
9520 map
<int32_t, uint32_t>* weights
)
9523 if (!cmd_getval(cmdmap
, "weights", weights_str
)) {
9526 std::replace(begin(weights_str
), end(weights_str
), '\'', '"');
9527 json_spirit::mValue json_value
;
9528 if (!json_spirit::read(weights_str
, json_value
)) {
9531 if (json_value
.type() != json_spirit::obj_type
) {
9534 const auto obj
= json_value
.get_obj();
9536 for (auto& osd_weight
: obj
) {
9537 auto osd_id
= std::stoi(osd_weight
.first
);
9538 if (!osdmap
.exists(osd_id
)) {
9541 if (osd_weight
.second
.type() != json_spirit::str_type
) {
9544 auto weight
= std::stoul(osd_weight
.second
.get_str());
9545 weights
->insert({osd_id
, weight
});
9547 } catch (const std::logic_error
& e
) {
9553 int OSDMonitor::prepare_command_osd_destroy(
9557 ceph_assert(paxos
.is_plugged());
9559 // we check if the osd exists for the benefit of `osd purge`, which may
9560 // have previously removed the osd. If the osd does not exist, return
9561 // -ENOENT to convey this, and let the caller deal with it.
9563 // we presume that all auth secrets and config keys were removed prior
9564 // to this command being called. if they exist by now, we also assume
9565 // they must have been created by some other command and do not pertain
9566 // to this non-existent osd.
9567 if (!osdmap
.exists(id
)) {
9568 dout(10) << __func__
<< " osd." << id
<< " does not exist." << dendl
;
9572 uuid_d uuid
= osdmap
.get_uuid(id
);
9573 dout(10) << __func__
<< " destroying osd." << id
9574 << " uuid " << uuid
<< dendl
;
9576 // if it has been destroyed, we assume our work here is done.
9577 if (osdmap
.is_destroyed(id
)) {
9578 ss
<< "destroyed osd." << id
;
9582 EntityName cephx_entity
, lockbox_entity
;
9583 bool idempotent_auth
= false, idempotent_cks
= false;
9585 int err
= mon
.authmon()->validate_osd_destroy(id
, uuid
,
9590 if (err
== -ENOENT
) {
9591 idempotent_auth
= true;
9597 auto svc
= mon
.kvmon();
9598 err
= svc
->validate_osd_destroy(id
, uuid
);
9600 ceph_assert(err
== -ENOENT
);
9602 idempotent_cks
= true;
9605 if (!idempotent_auth
) {
9606 err
= mon
.authmon()->do_osd_destroy(cephx_entity
, lockbox_entity
);
9607 ceph_assert(0 == err
);
9610 if (!idempotent_cks
) {
9611 svc
->do_osd_destroy(id
, uuid
);
9614 pending_inc
.new_state
[id
] = CEPH_OSD_DESTROYED
;
9615 pending_inc
.new_uuid
[id
] = uuid_d();
9617 // we can only propose_pending() once per service, otherwise we'll be
9618 // defying PaxosService and all laws of nature. Therefore, as we may
9619 // be used during 'osd purge', let's keep the caller responsible for
9621 ceph_assert(err
== 0);
9625 int OSDMonitor::prepare_command_osd_purge(
9629 ceph_assert(paxos
.is_plugged());
9630 dout(10) << __func__
<< " purging osd." << id
<< dendl
;
9632 ceph_assert(!osdmap
.is_up(id
));
9635 * This may look a bit weird, but this is what's going to happen:
9637 * 1. we make sure that removing from crush works
9638 * 2. we call `prepare_command_osd_destroy()`. If it returns an
9639 * error, then we abort the whole operation, as no updates
9640 * have been made. However, we this function will have
9641 * side-effects, thus we need to make sure that all operations
9642 * performed henceforth will *always* succeed.
9643 * 3. we call `prepare_command_osd_remove()`. Although this
9644 * function can return an error, it currently only checks if the
9645 * osd is up - and we have made sure that it is not so, so there
9646 * is no conflict, and it is effectively an update.
9647 * 4. finally, we call `do_osd_crush_remove()`, which will perform
9648 * the crush update we delayed from before.
9651 CrushWrapper newcrush
;
9652 _get_pending_crush(newcrush
);
9654 bool may_be_idempotent
= false;
9656 int err
= _prepare_command_osd_crush_remove(newcrush
, id
, 0, false, false);
9657 if (err
== -ENOENT
) {
9659 may_be_idempotent
= true;
9660 } else if (err
< 0) {
9661 ss
<< "error removing osd." << id
<< " from crush";
9665 // no point destroying the osd again if it has already been marked destroyed
9666 if (!osdmap
.is_destroyed(id
)) {
9667 err
= prepare_command_osd_destroy(id
, ss
);
9669 if (err
== -ENOENT
) {
9675 may_be_idempotent
= false;
9678 ceph_assert(0 == err
);
9680 if (may_be_idempotent
&& !osdmap
.exists(id
)) {
9681 dout(10) << __func__
<< " osd." << id
<< " does not exist and "
9682 << "we are idempotent." << dendl
;
9686 err
= prepare_command_osd_remove(id
);
9687 // we should not be busy, as we should have made sure this id is not up.
9688 ceph_assert(0 == err
);
9690 do_osd_crush_remove(newcrush
);
9694 bool OSDMonitor::prepare_command_impl(MonOpRequestRef op
,
9695 const cmdmap_t
& cmdmap
)
9697 op
->mark_osdmon_event(__func__
);
9698 auto m
= op
->get_req
<MMonCommand
>();
9706 cmd_getval(cmdmap
, "format", format
, string("plain"));
9707 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
));
9710 cmd_getval(cmdmap
, "prefix", prefix
);
9714 bool osdid_present
= false;
9715 if (prefix
!= "osd pg-temp" &&
9716 prefix
!= "osd pg-upmap" &&
9717 prefix
!= "osd pg-upmap-items") { // avoid commands with non-int id arg
9718 osdid_present
= cmd_getval(cmdmap
, "id", osdid
);
9720 if (osdid_present
) {
9722 oss
<< "osd." << osdid
;
9723 osd_name
= oss
.str();
9726 // Even if there's a pending state with changes that could affect
9727 // a command, considering that said state isn't yet committed, we
9728 // just don't care about those changes if the command currently being
9729 // handled acts as a no-op against the current committed state.
9730 // In a nutshell, we assume this command happens *before*.
9732 // Let me make this clearer:
9734 // - If we have only one client, and that client issues some
9735 // operation that would conflict with this operation but is
9736 // still on the pending state, then we would be sure that said
9737 // operation wouldn't have returned yet, so the client wouldn't
9738 // issue this operation (unless the client didn't wait for the
9739 // operation to finish, and that would be the client's own fault).
9741 // - If we have more than one client, each client will observe
9742 // whatever is the state at the moment of the commit. So, if we
9743 // have two clients, one issuing an unlink and another issuing a
9744 // link, and if the link happens while the unlink is still on the
9745 // pending state, from the link's point-of-view this is a no-op.
9746 // If different clients are issuing conflicting operations and
9747 // they care about that, then the clients should make sure they
9748 // enforce some kind of concurrency mechanism -- from our
9749 // perspective that's what Douglas Adams would call an SEP.
9751 // This should be used as a general guideline for most commands handled
9752 // in this function. Adapt as you see fit, but please bear in mind that
9753 // this is the expected behavior.
9756 if (prefix
== "osd setcrushmap" ||
9757 (prefix
== "osd crush set" && !osdid_present
)) {
9758 if (pending_inc
.crush
.length()) {
9759 dout(10) << __func__
<< " waiting for pending crush update " << dendl
;
9760 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
9763 dout(10) << "prepare_command setting new crush map" << dendl
;
9764 bufferlist
data(m
->get_data());
9767 auto bl
= data
.cbegin();
9770 catch (const std::exception
&e
) {
9772 ss
<< "Failed to parse crushmap: " << e
.what();
9776 int64_t prior_version
= 0;
9777 if (cmd_getval(cmdmap
, "prior_version", prior_version
)) {
9778 if (prior_version
== osdmap
.get_crush_version() - 1) {
9779 // see if we are a resend of the last update. this is imperfect
9780 // (multiple racing updaters may not both get reliable success)
9781 // but we expect crush updaters (via this interface) to be rare-ish.
9782 bufferlist current
, proposed
;
9783 osdmap
.crush
->encode(current
, mon
.get_quorum_con_features());
9784 crush
.encode(proposed
, mon
.get_quorum_con_features());
9785 if (current
.contents_equal(proposed
)) {
9786 dout(10) << __func__
9787 << " proposed matches current and version equals previous"
9790 ss
<< osdmap
.get_crush_version();
9794 if (prior_version
!= osdmap
.get_crush_version()) {
9796 ss
<< "prior_version " << prior_version
<< " != crush version "
9797 << osdmap
.get_crush_version();
9802 if (crush
.has_legacy_rule_ids()) {
9804 ss
<< "crush maps with ruleset != ruleid are no longer allowed";
9807 if (!validate_crush_against_features(&crush
, ss
)) {
9812 err
= osdmap
.validate_crush_rules(&crush
, &ss
);
9817 if (g_conf()->mon_osd_crush_smoke_test
) {
9818 // sanity check: test some inputs to make sure this map isn't
9820 dout(10) << " testing map" << dendl
;
9822 CrushTester
tester(crush
, ess
);
9823 tester
.set_min_x(0);
9824 tester
.set_max_x(50);
9825 auto start
= ceph::coarse_mono_clock::now();
9826 int r
= tester
.test_with_fork(g_conf()->mon_lease
);
9827 auto duration
= ceph::coarse_mono_clock::now() - start
;
9829 dout(10) << " tester.test_with_fork returns " << r
9830 << ": " << ess
.str() << dendl
;
9831 ss
<< "crush smoke test failed with " << r
<< ": " << ess
.str();
9835 dout(10) << __func__
<< " crush somke test duration: "
9836 << duration
<< ", result: " << ess
.str() << dendl
;
9839 pending_inc
.crush
= data
;
9840 ss
<< osdmap
.get_crush_version() + 1;
9843 } else if (prefix
== "osd crush set-all-straw-buckets-to-straw2") {
9844 CrushWrapper newcrush
;
9845 _get_pending_crush(newcrush
);
9846 for (int b
= 0; b
< newcrush
.get_max_buckets(); ++b
) {
9848 if (newcrush
.bucket_exists(bid
) &&
9849 newcrush
.get_bucket_alg(bid
) == CRUSH_BUCKET_STRAW
) {
9850 dout(20) << " bucket " << bid
<< " is straw, can convert" << dendl
;
9851 newcrush
.bucket_set_alg(bid
, CRUSH_BUCKET_STRAW2
);
9854 if (!validate_crush_against_features(&newcrush
, ss
)) {
9858 pending_inc
.crush
.clear();
9859 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9860 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
9861 get_last_committed() + 1));
9863 } else if (prefix
== "osd crush set-device-class") {
9864 string device_class
;
9865 if (!cmd_getval(cmdmap
, "class", device_class
)) {
9866 err
= -EINVAL
; // no value!
9871 vector
<string
> idvec
;
9872 cmd_getval(cmdmap
, "ids", idvec
);
9873 CrushWrapper newcrush
;
9874 _get_pending_crush(newcrush
);
9876 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9880 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9881 osdmap
.get_all_osds(osds
);
9884 // try traditional single osd way
9885 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9887 // ss has reason for failure
9888 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9895 for (auto &osd
: osds
) {
9896 if (!osdmap
.exists(osd
)) {
9897 ss
<< "osd." << osd
<< " does not exist. ";
9902 oss
<< "osd." << osd
;
9903 string name
= oss
.str();
9905 if (newcrush
.get_max_devices() < osd
+ 1) {
9906 newcrush
.set_max_devices(osd
+ 1);
9909 if (newcrush
.item_exists(osd
)) {
9910 action
= "updating";
9912 action
= "creating";
9913 newcrush
.set_item_name(osd
, name
);
9916 dout(5) << action
<< " crush item id " << osd
<< " name '" << name
9917 << "' device_class '" << device_class
<< "'"
9919 err
= newcrush
.update_device_class(osd
, device_class
, name
, &ss
);
9923 if (err
== 0 && !_have_pending_crush()) {
9925 // for single osd only, wildcard makes too much noise
9926 ss
<< "set-device-class item id " << osd
<< " name '" << name
9927 << "' device_class '" << device_class
<< "': no change. ";
9930 updated
.insert(osd
);
9935 pending_inc
.crush
.clear();
9936 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9937 ss
<< "set osd(s) " << updated
<< " to class '" << device_class
<< "'";
9939 wait_for_finished_proposal(
9941 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
9943 } else if (prefix
== "osd crush rm-device-class") {
9945 vector
<string
> idvec
;
9946 cmd_getval(cmdmap
, "ids", idvec
);
9947 CrushWrapper newcrush
;
9948 _get_pending_crush(newcrush
);
9951 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
9956 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
9957 osdmap
.get_all_osds(osds
);
9960 // try traditional single osd way
9961 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
9963 // ss has reason for failure
9964 ss
<< ", unable to parse osd id:\"" << idvec
[j
] << "\". ";
9971 for (auto &osd
: osds
) {
9972 if (!osdmap
.exists(osd
)) {
9973 ss
<< "osd." << osd
<< " does not exist. ";
9977 auto class_name
= newcrush
.get_item_class(osd
);
9979 ss
<< "osd." << osd
<< " belongs to no class, ";
9982 // note that we do not verify if class_is_in_use here
9983 // in case the device is misclassified and user wants
9984 // to overridely reset...
9986 err
= newcrush
.remove_device_class(cct
, osd
, &ss
);
9988 // ss has reason for failure
9991 updated
.insert(osd
);
9995 pending_inc
.crush
.clear();
9996 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
9997 ss
<< "done removing class of osd(s): " << updated
;
9999 wait_for_finished_proposal(
10001 new Monitor::C_Command(mon
,op
, 0, rs
, get_last_committed() + 1));
10003 } else if (prefix
== "osd crush class create") {
10004 string device_class
;
10005 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10006 err
= -EINVAL
; // no value!
10009 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10010 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10011 << "luminous' before using crush device classes";
10015 if (!_have_pending_crush() &&
10016 _get_stable_crush().class_exists(device_class
)) {
10017 ss
<< "class '" << device_class
<< "' already exists";
10020 CrushWrapper newcrush
;
10021 _get_pending_crush(newcrush
);
10022 if (newcrush
.class_exists(device_class
)) {
10023 ss
<< "class '" << device_class
<< "' already exists";
10026 int class_id
= newcrush
.get_or_create_class_id(device_class
);
10027 pending_inc
.crush
.clear();
10028 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10029 ss
<< "created class " << device_class
<< " with id " << class_id
10030 << " to crush map";
10032 } else if (prefix
== "osd crush class rm") {
10033 string device_class
;
10034 if (!cmd_getval(cmdmap
, "class", device_class
)) {
10035 err
= -EINVAL
; // no value!
10038 if (osdmap
.require_osd_release
< ceph_release_t::luminous
) {
10039 ss
<< "you must complete the upgrade and 'ceph osd require-osd-release "
10040 << "luminous' before using crush device classes";
10045 if (!osdmap
.crush
->class_exists(device_class
)) {
10050 CrushWrapper newcrush
;
10051 _get_pending_crush(newcrush
);
10052 if (!newcrush
.class_exists(device_class
)) {
10053 err
= 0; // make command idempotent
10056 int class_id
= newcrush
.get_class_id(device_class
);
10058 if (newcrush
.class_is_in_use(class_id
, &ts
)) {
10060 ss
<< "class '" << device_class
<< "' " << ts
.str();
10064 // check if class is used by any erasure-code-profiles
10065 mempool::osdmap::map
<string
,map
<string
,string
>> old_ec_profiles
=
10066 osdmap
.get_erasure_code_profiles();
10067 auto ec_profiles
= pending_inc
.get_erasure_code_profiles();
10068 #ifdef HAVE_STDLIB_MAP_SPLICING
10069 ec_profiles
.merge(old_ec_profiles
);
10071 ec_profiles
.insert(make_move_iterator(begin(old_ec_profiles
)),
10072 make_move_iterator(end(old_ec_profiles
)));
10074 list
<string
> referenced_by
;
10075 for (auto &i
: ec_profiles
) {
10076 for (auto &j
: i
.second
) {
10077 if ("crush-device-class" == j
.first
&& device_class
== j
.second
) {
10078 referenced_by
.push_back(i
.first
);
10082 if (!referenced_by
.empty()) {
10084 ss
<< "class '" << device_class
10085 << "' is still referenced by erasure-code-profile(s): " << referenced_by
;
10090 newcrush
.get_devices_by_class(device_class
, &osds
);
10091 for (auto& p
: osds
) {
10092 err
= newcrush
.remove_device_class(g_ceph_context
, p
, &ss
);
10094 // ss has reason for failure
10099 if (osds
.empty()) {
10100 // empty class, remove directly
10101 err
= newcrush
.remove_class_name(device_class
);
10103 ss
<< "class '" << device_class
<< "' cannot be removed '"
10104 << cpp_strerror(err
) << "'";
10109 pending_inc
.crush
.clear();
10110 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10111 ss
<< "removed class " << device_class
<< " with id " << class_id
10112 << " from crush map";
10114 } else if (prefix
== "osd crush class rename") {
10115 string srcname
, dstname
;
10116 if (!cmd_getval(cmdmap
, "srcname", srcname
)) {
10120 if (!cmd_getval(cmdmap
, "dstname", dstname
)) {
10125 CrushWrapper newcrush
;
10126 _get_pending_crush(newcrush
);
10127 if (!newcrush
.class_exists(srcname
) && newcrush
.class_exists(dstname
)) {
10128 // suppose this is a replay and return success
10129 // so command is idempotent
10130 ss
<< "already renamed to '" << dstname
<< "'";
10135 err
= newcrush
.rename_class(srcname
, dstname
);
10137 ss
<< "fail to rename '" << srcname
<< "' to '" << dstname
<< "' : "
10138 << cpp_strerror(err
);
10142 pending_inc
.crush
.clear();
10143 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10144 ss
<< "rename class '" << srcname
<< "' to '" << dstname
<< "'";
10146 } else if (prefix
== "osd crush add-bucket") {
10147 // os crush add-bucket <name> <type>
10148 string name
, typestr
;
10149 vector
<string
> argvec
;
10150 cmd_getval(cmdmap
, "name", name
);
10151 cmd_getval(cmdmap
, "type", typestr
);
10152 cmd_getval(cmdmap
, "args", argvec
);
10153 map
<string
,string
> loc
;
10154 if (!argvec
.empty()) {
10155 CrushWrapper::parse_loc_map(argvec
, &loc
);
10156 dout(0) << "will create and move bucket '" << name
10157 << "' to location " << loc
<< dendl
;
10160 if (!_have_pending_crush() &&
10161 _get_stable_crush().name_exists(name
)) {
10162 ss
<< "bucket '" << name
<< "' already exists";
10166 CrushWrapper newcrush
;
10167 _get_pending_crush(newcrush
);
10169 if (newcrush
.name_exists(name
)) {
10170 ss
<< "bucket '" << name
<< "' already exists";
10173 int type
= newcrush
.get_type_id(typestr
);
10175 ss
<< "type '" << typestr
<< "' does not exist";
10180 ss
<< "type '" << typestr
<< "' is for devices, not buckets";
10185 err
= newcrush
.add_bucket(0, 0,
10186 CRUSH_HASH_DEFAULT
, type
, 0, NULL
,
10189 ss
<< "add_bucket error: '" << cpp_strerror(err
) << "'";
10192 err
= newcrush
.set_item_name(bucketno
, name
);
10194 ss
<< "error setting bucket name to '" << name
<< "'";
10198 if (!loc
.empty()) {
10199 if (!newcrush
.check_item_loc(cct
, bucketno
, loc
,
10201 err
= newcrush
.move_bucket(cct
, bucketno
, loc
);
10203 ss
<< "error moving bucket '" << name
<< "' to location " << loc
;
10207 ss
<< "no need to move item id " << bucketno
<< " name '" << name
10208 << "' to location " << loc
<< " in crush map";
10212 pending_inc
.crush
.clear();
10213 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10215 ss
<< "added bucket " << name
<< " type " << typestr
10216 << " to crush map";
10218 ss
<< "added bucket " << name
<< " type " << typestr
10219 << " to location " << loc
;
10222 } else if (prefix
== "osd crush rename-bucket") {
10223 string srcname
, dstname
;
10224 cmd_getval(cmdmap
, "srcname", srcname
);
10225 cmd_getval(cmdmap
, "dstname", dstname
);
10227 err
= crush_rename_bucket(srcname
, dstname
, &ss
);
10228 if (err
== -EALREADY
) // equivalent to success for idempotency
10234 } else if (prefix
== "osd crush weight-set create" ||
10235 prefix
== "osd crush weight-set create-compat") {
10236 CrushWrapper newcrush
;
10237 _get_pending_crush(newcrush
);
10240 if (newcrush
.has_non_straw2_buckets()) {
10241 ss
<< "crush map contains one or more bucket(s) that are not straw2";
10245 if (prefix
== "osd crush weight-set create") {
10246 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
10247 osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
10248 ss
<< "require_min_compat_client "
10249 << osdmap
.require_min_compat_client
10250 << " < luminous, which is required for per-pool weight-sets. "
10251 << "Try 'ceph osd set-require-min-compat-client luminous' "
10252 << "before using the new interface";
10256 string poolname
, mode
;
10257 cmd_getval(cmdmap
, "pool", poolname
);
10258 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10260 ss
<< "pool '" << poolname
<< "' not found";
10264 cmd_getval(cmdmap
, "mode", mode
);
10265 if (mode
!= "flat" && mode
!= "positional") {
10266 ss
<< "unrecognized weight-set mode '" << mode
<< "'";
10270 positions
= mode
== "flat" ? 1 : osdmap
.get_pg_pool(pool
)->get_size();
10272 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10275 if (!newcrush
.create_choose_args(pool
, positions
)) {
10276 if (pool
== CrushWrapper::DEFAULT_CHOOSE_ARGS
) {
10277 ss
<< "compat weight-set already created";
10279 ss
<< "weight-set for pool '" << osdmap
.get_pool_name(pool
)
10280 << "' already created";
10284 pending_inc
.crush
.clear();
10285 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10288 } else if (prefix
== "osd crush weight-set rm" ||
10289 prefix
== "osd crush weight-set rm-compat") {
10290 CrushWrapper newcrush
;
10291 _get_pending_crush(newcrush
);
10293 if (prefix
== "osd crush weight-set rm") {
10295 cmd_getval(cmdmap
, "pool", poolname
);
10296 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10298 ss
<< "pool '" << poolname
<< "' not found";
10303 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10305 newcrush
.rm_choose_args(pool
);
10306 pending_inc
.crush
.clear();
10307 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10310 } else if (prefix
== "osd crush weight-set reweight" ||
10311 prefix
== "osd crush weight-set reweight-compat") {
10312 string poolname
, item
;
10313 vector
<double> weight
;
10314 cmd_getval(cmdmap
, "pool", poolname
);
10315 cmd_getval(cmdmap
, "item", item
);
10316 cmd_getval(cmdmap
, "weight", weight
);
10317 CrushWrapper newcrush
;
10318 _get_pending_crush(newcrush
);
10320 if (prefix
== "osd crush weight-set reweight") {
10321 pool
= osdmap
.lookup_pg_pool_name(poolname
.c_str());
10323 ss
<< "pool '" << poolname
<< "' not found";
10327 if (!newcrush
.have_choose_args(pool
)) {
10328 ss
<< "no weight-set for pool '" << poolname
<< "'";
10332 auto arg_map
= newcrush
.choose_args_get(pool
);
10333 int positions
= newcrush
.get_choose_args_positions(arg_map
);
10334 if (weight
.size() != (size_t)positions
) {
10335 ss
<< "must specify exact " << positions
<< " weight values";
10340 pool
= CrushWrapper::DEFAULT_CHOOSE_ARGS
;
10341 if (!newcrush
.have_choose_args(pool
)) {
10342 ss
<< "no backward-compatible weight-set";
10347 if (!newcrush
.name_exists(item
)) {
10348 ss
<< "item '" << item
<< "' does not exist";
10352 err
= newcrush
.choose_args_adjust_item_weightf(
10354 newcrush
.choose_args_get(pool
),
10355 newcrush
.get_item_id(item
),
10362 pending_inc
.crush
.clear();
10363 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10365 } else if (osdid_present
&&
10366 (prefix
== "osd crush set" || prefix
== "osd crush add")) {
10367 // <OsdName> is 'osd.<id>' or '<id>', passed as int64_t id
10368 // osd crush set <OsdName> <weight> <loc1> [<loc2> ...]
10369 // osd crush add <OsdName> <weight> <loc1> [<loc2> ...]
10371 if (!osdmap
.exists(osdid
)) {
10374 << " does not exist. Create it before updating the crush map";
10379 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10380 ss
<< "unable to parse weight value '"
10381 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10387 vector
<string
> argvec
;
10388 cmd_getval(cmdmap
, "args", argvec
);
10389 map
<string
,string
> loc
;
10390 CrushWrapper::parse_loc_map(argvec
, &loc
);
10392 if (prefix
== "osd crush set"
10393 && !_get_stable_crush().item_exists(osdid
)) {
10395 ss
<< "unable to set item id " << osdid
<< " name '" << osd_name
10396 << "' weight " << weight
<< " at location " << loc
10397 << ": does not exist";
10401 dout(5) << "adding/updating crush item id " << osdid
<< " name '"
10402 << osd_name
<< "' weight " << weight
<< " at location "
10404 CrushWrapper newcrush
;
10405 _get_pending_crush(newcrush
);
10408 if (prefix
== "osd crush set" ||
10409 newcrush
.check_item_loc(cct
, osdid
, loc
, (int *)NULL
)) {
10411 err
= newcrush
.update_item(cct
, osdid
, weight
, osd_name
, loc
);
10414 err
= newcrush
.insert_item(cct
, osdid
, weight
, osd_name
, loc
);
10422 if (err
== 0 && !_have_pending_crush()) {
10423 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
10424 << "' weight " << weight
<< " at location " << loc
<< ": no change";
10428 pending_inc
.crush
.clear();
10429 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10430 ss
<< action
<< " item id " << osdid
<< " name '" << osd_name
<< "' weight "
10431 << weight
<< " at location " << loc
<< " to crush map";
10433 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10434 get_last_committed() + 1));
10437 } else if (prefix
== "osd crush create-or-move") {
10439 // osd crush create-or-move <OsdName> <initial_weight> <loc1> [<loc2> ...]
10440 if (!osdmap
.exists(osdid
)) {
10443 << " does not exist. create it before updating the crush map";
10448 if (!cmd_getval(cmdmap
, "weight", weight
)) {
10449 ss
<< "unable to parse weight value '"
10450 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10456 vector
<string
> argvec
;
10457 cmd_getval(cmdmap
, "args", argvec
);
10458 map
<string
,string
> loc
;
10459 CrushWrapper::parse_loc_map(argvec
, &loc
);
10461 dout(0) << "create-or-move crush item name '" << osd_name
10462 << "' initial_weight " << weight
<< " at location " << loc
10465 CrushWrapper newcrush
;
10466 _get_pending_crush(newcrush
);
10468 err
= newcrush
.create_or_move_item(cct
, osdid
, weight
, osd_name
, loc
,
10469 g_conf()->osd_crush_update_weight_set
);
10471 ss
<< "create-or-move updated item name '" << osd_name
10472 << "' weight " << weight
10473 << " at location " << loc
<< " to crush map";
10477 pending_inc
.crush
.clear();
10478 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10479 ss
<< "create-or-move updating item name '" << osd_name
10480 << "' weight " << weight
10481 << " at location " << loc
<< " to crush map";
10483 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10484 get_last_committed() + 1));
10489 } else if (prefix
== "osd crush move") {
10491 // osd crush move <name> <loc1> [<loc2> ...]
10493 vector
<string
> argvec
;
10494 cmd_getval(cmdmap
, "name", name
);
10495 cmd_getval(cmdmap
, "args", argvec
);
10496 map
<string
,string
> loc
;
10497 CrushWrapper::parse_loc_map(argvec
, &loc
);
10499 dout(0) << "moving crush item name '" << name
<< "' to location " << loc
<< dendl
;
10500 CrushWrapper newcrush
;
10501 _get_pending_crush(newcrush
);
10503 if (!newcrush
.name_exists(name
)) {
10505 ss
<< "item " << name
<< " does not exist";
10508 int id
= newcrush
.get_item_id(name
);
10510 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10512 err
= newcrush
.create_or_move_item(
10513 cct
, id
, 0, name
, loc
,
10514 g_conf()->osd_crush_update_weight_set
);
10516 err
= newcrush
.move_bucket(cct
, id
, loc
);
10519 ss
<< "moved item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10520 pending_inc
.crush
.clear();
10521 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10523 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10524 get_last_committed() + 1));
10528 ss
<< "no need to move item id " << id
<< " name '" << name
<< "' to location " << loc
<< " in crush map";
10532 } else if (prefix
== "osd crush swap-bucket") {
10533 string source
, dest
;
10534 cmd_getval(cmdmap
, "source", source
);
10535 cmd_getval(cmdmap
, "dest", dest
);
10537 bool force
= false;
10538 cmd_getval(cmdmap
, "yes_i_really_mean_it", force
);
10540 CrushWrapper newcrush
;
10541 _get_pending_crush(newcrush
);
10542 if (!newcrush
.name_exists(source
)) {
10543 ss
<< "source item " << source
<< " does not exist";
10547 if (!newcrush
.name_exists(dest
)) {
10548 ss
<< "dest item " << dest
<< " does not exist";
10552 int sid
= newcrush
.get_item_id(source
);
10553 int did
= newcrush
.get_item_id(dest
);
10555 if (newcrush
.get_immediate_parent_id(sid
, &sparent
) == 0 && !force
) {
10556 ss
<< "source item " << source
<< " is not an orphan bucket; pass --yes-i-really-mean-it to proceed anyway";
10560 if (newcrush
.get_bucket_alg(sid
) != newcrush
.get_bucket_alg(did
) &&
10562 ss
<< "source bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(sid
)) << " != "
10563 << "dest bucket alg " << crush_alg_name(newcrush
.get_bucket_alg(did
))
10564 << "; pass --yes-i-really-mean-it to proceed anyway";
10568 int r
= newcrush
.swap_bucket(cct
, sid
, did
);
10570 ss
<< "failed to swap bucket contents: " << cpp_strerror(r
);
10574 ss
<< "swapped bucket of " << source
<< " to " << dest
;
10575 pending_inc
.crush
.clear();
10576 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10577 wait_for_finished_proposal(op
,
10578 new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10579 get_last_committed() + 1));
10581 } else if (prefix
== "osd crush link") {
10582 // osd crush link <name> <loc1> [<loc2> ...]
10584 cmd_getval(cmdmap
, "name", name
);
10585 vector
<string
> argvec
;
10586 cmd_getval(cmdmap
, "args", argvec
);
10587 map
<string
,string
> loc
;
10588 CrushWrapper::parse_loc_map(argvec
, &loc
);
10590 // Need an explicit check for name_exists because get_item_id returns
10592 int id
= osdmap
.crush
->get_item_id(name
);
10593 if (!osdmap
.crush
->name_exists(name
)) {
10595 ss
<< "item " << name
<< " does not exist";
10598 dout(5) << "resolved crush name '" << name
<< "' to id " << id
<< dendl
;
10600 if (osdmap
.crush
->check_item_loc(cct
, id
, loc
, (int*) NULL
)) {
10601 ss
<< "no need to move item id " << id
<< " name '" << name
10602 << "' to location " << loc
<< " in crush map";
10607 dout(5) << "linking crush item name '" << name
<< "' at location " << loc
<< dendl
;
10608 CrushWrapper newcrush
;
10609 _get_pending_crush(newcrush
);
10611 if (!newcrush
.name_exists(name
)) {
10613 ss
<< "item " << name
<< " does not exist";
10616 int id
= newcrush
.get_item_id(name
);
10617 if (!newcrush
.check_item_loc(cct
, id
, loc
, (int *)NULL
)) {
10618 err
= newcrush
.link_bucket(cct
, id
, loc
);
10620 ss
<< "linked item id " << id
<< " name '" << name
10621 << "' to location " << loc
<< " in crush map";
10622 pending_inc
.crush
.clear();
10623 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10625 ss
<< "cannot link item id " << id
<< " name '" << name
10626 << "' to location " << loc
;
10630 ss
<< "no need to move item id " << id
<< " name '" << name
10631 << "' to location " << loc
<< " in crush map";
10635 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, ss
.str(),
10636 get_last_committed() + 1));
10638 } else if (prefix
== "osd crush rm" ||
10639 prefix
== "osd crush remove" ||
10640 prefix
== "osd crush unlink") {
10642 // osd crush rm <id> [ancestor]
10643 CrushWrapper newcrush
;
10644 _get_pending_crush(newcrush
);
10647 cmd_getval(cmdmap
, "name", name
);
10649 if (!osdmap
.crush
->name_exists(name
)) {
10651 ss
<< "device '" << name
<< "' does not appear in the crush map";
10654 if (!newcrush
.name_exists(name
)) {
10656 ss
<< "device '" << name
<< "' does not appear in the crush map";
10658 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10659 get_last_committed() + 1));
10662 int id
= newcrush
.get_item_id(name
);
10665 bool unlink_only
= prefix
== "osd crush unlink";
10666 string ancestor_str
;
10667 if (cmd_getval(cmdmap
, "ancestor", ancestor_str
)) {
10668 if (!newcrush
.name_exists(ancestor_str
)) {
10670 ss
<< "ancestor item '" << ancestor_str
10671 << "' does not appear in the crush map";
10674 ancestor
= newcrush
.get_item_id(ancestor_str
);
10677 err
= prepare_command_osd_crush_remove(
10680 (ancestor
< 0), unlink_only
);
10682 if (err
== -ENOENT
) {
10683 ss
<< "item " << id
<< " does not appear in that position";
10689 pending_inc
.new_crush_node_flags
[id
] = 0;
10690 ss
<< "removed item id " << id
<< " name '" << name
<< "' from crush map";
10692 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10693 get_last_committed() + 1));
10698 } else if (prefix
== "osd crush reweight-all") {
10699 CrushWrapper newcrush
;
10700 _get_pending_crush(newcrush
);
10702 newcrush
.reweight(cct
);
10703 pending_inc
.crush
.clear();
10704 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10705 ss
<< "reweighted crush hierarchy";
10707 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10708 get_last_committed() + 1));
10710 } else if (prefix
== "osd crush reweight") {
10711 // osd crush reweight <name> <weight>
10712 CrushWrapper newcrush
;
10713 _get_pending_crush(newcrush
);
10716 cmd_getval(cmdmap
, "name", name
);
10717 if (!newcrush
.name_exists(name
)) {
10719 ss
<< "device '" << name
<< "' does not appear in the crush map";
10723 int id
= newcrush
.get_item_id(name
);
10725 ss
<< "device '" << name
<< "' is not a leaf in the crush map";
10730 if (!cmd_getval(cmdmap
, "weight", w
)) {
10731 ss
<< "unable to parse weight value '"
10732 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10737 err
= newcrush
.adjust_item_weightf(cct
, id
, w
,
10738 g_conf()->osd_crush_update_weight_set
);
10741 pending_inc
.crush
.clear();
10742 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10743 ss
<< "reweighted item id " << id
<< " name '" << name
<< "' to " << w
10744 << " in crush map";
10746 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10747 get_last_committed() + 1));
10749 } else if (prefix
== "osd crush reweight-subtree") {
10750 // osd crush reweight <name> <weight>
10751 CrushWrapper newcrush
;
10752 _get_pending_crush(newcrush
);
10755 cmd_getval(cmdmap
, "name", name
);
10756 if (!newcrush
.name_exists(name
)) {
10758 ss
<< "device '" << name
<< "' does not appear in the crush map";
10762 int id
= newcrush
.get_item_id(name
);
10764 ss
<< "device '" << name
<< "' is not a subtree in the crush map";
10769 if (!cmd_getval(cmdmap
, "weight", w
)) {
10770 ss
<< "unable to parse weight value '"
10771 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
10776 err
= newcrush
.adjust_subtree_weightf(cct
, id
, w
,
10777 g_conf()->osd_crush_update_weight_set
);
10780 pending_inc
.crush
.clear();
10781 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10782 ss
<< "reweighted subtree id " << id
<< " name '" << name
<< "' to " << w
10783 << " in crush map";
10785 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10786 get_last_committed() + 1));
10788 } else if (prefix
== "osd crush tunables") {
10789 CrushWrapper newcrush
;
10790 _get_pending_crush(newcrush
);
10794 cmd_getval(cmdmap
, "profile", profile
);
10795 if (profile
== "legacy" || profile
== "argonaut") {
10796 newcrush
.set_tunables_legacy();
10797 } else if (profile
== "bobtail") {
10798 newcrush
.set_tunables_bobtail();
10799 } else if (profile
== "firefly") {
10800 newcrush
.set_tunables_firefly();
10801 } else if (profile
== "hammer") {
10802 newcrush
.set_tunables_hammer();
10803 } else if (profile
== "jewel") {
10804 newcrush
.set_tunables_jewel();
10805 } else if (profile
== "optimal") {
10806 newcrush
.set_tunables_optimal();
10807 } else if (profile
== "default") {
10808 newcrush
.set_tunables_default();
10810 ss
<< "unrecognized profile '" << profile
<< "'";
10815 if (!validate_crush_against_features(&newcrush
, ss
)) {
10820 pending_inc
.crush
.clear();
10821 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10822 ss
<< "adjusted tunables profile to " << profile
;
10824 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10825 get_last_committed() + 1));
10827 } else if (prefix
== "osd crush set-tunable") {
10828 CrushWrapper newcrush
;
10829 _get_pending_crush(newcrush
);
10833 cmd_getval(cmdmap
, "tunable", tunable
);
10835 int64_t value
= -1;
10836 if (!cmd_getval(cmdmap
, "value", value
)) {
10838 ss
<< "failed to parse integer value "
10839 << cmd_vartype_stringify(cmdmap
.at("value"));
10843 if (tunable
== "straw_calc_version") {
10844 if (value
!= 0 && value
!= 1) {
10845 ss
<< "value must be 0 or 1; got " << value
;
10849 newcrush
.set_straw_calc_version(value
);
10851 ss
<< "unrecognized tunable '" << tunable
<< "'";
10856 if (!validate_crush_against_features(&newcrush
, ss
)) {
10861 pending_inc
.crush
.clear();
10862 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10863 ss
<< "adjusted tunable " << tunable
<< " to " << value
;
10865 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10866 get_last_committed() + 1));
10869 } else if (prefix
== "osd crush rule create-simple") {
10870 string name
, root
, type
, mode
;
10871 cmd_getval(cmdmap
, "name", name
);
10872 cmd_getval(cmdmap
, "root", root
);
10873 cmd_getval(cmdmap
, "type", type
);
10874 cmd_getval(cmdmap
, "mode", mode
);
10878 if (osdmap
.crush
->rule_exists(name
)) {
10879 // The name is uniquely associated to a ruleid and the rule it contains
10880 // From the user point of view, the rule is more meaningfull.
10881 ss
<< "rule " << name
<< " already exists";
10886 CrushWrapper newcrush
;
10887 _get_pending_crush(newcrush
);
10889 if (newcrush
.rule_exists(name
)) {
10890 // The name is uniquely associated to a ruleid and the rule it contains
10891 // From the user point of view, the rule is more meaningfull.
10892 ss
<< "rule " << name
<< " already exists";
10895 int ruleno
= newcrush
.add_simple_rule(name
, root
, type
, "", mode
,
10896 pg_pool_t::TYPE_REPLICATED
, &ss
);
10902 pending_inc
.crush
.clear();
10903 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10906 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10907 get_last_committed() + 1));
10910 } else if (prefix
== "osd crush rule create-replicated") {
10911 string name
, root
, type
, device_class
;
10912 cmd_getval(cmdmap
, "name", name
);
10913 cmd_getval(cmdmap
, "root", root
);
10914 cmd_getval(cmdmap
, "type", type
);
10915 cmd_getval(cmdmap
, "class", device_class
);
10917 if (osdmap
.crush
->rule_exists(name
)) {
10918 // The name is uniquely associated to a ruleid and the rule it contains
10919 // From the user point of view, the rule is more meaningfull.
10920 ss
<< "rule " << name
<< " already exists";
10925 CrushWrapper newcrush
;
10926 _get_pending_crush(newcrush
);
10928 if (newcrush
.rule_exists(name
)) {
10929 // The name is uniquely associated to a ruleid and the rule it contains
10930 // From the user point of view, the rule is more meaningfull.
10931 ss
<< "rule " << name
<< " already exists";
10934 int ruleno
= newcrush
.add_simple_rule(
10935 name
, root
, type
, device_class
,
10936 "firstn", pg_pool_t::TYPE_REPLICATED
, &ss
);
10942 pending_inc
.crush
.clear();
10943 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
10946 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10947 get_last_committed() + 1));
10950 } else if (prefix
== "osd erasure-code-profile rm") {
10952 cmd_getval(cmdmap
, "name", name
);
10954 if (erasure_code_profile_in_use(pending_inc
.new_pools
, name
, &ss
))
10957 if (erasure_code_profile_in_use(osdmap
.pools
, name
, &ss
)) {
10962 if (osdmap
.has_erasure_code_profile(name
) ||
10963 pending_inc
.new_erasure_code_profiles
.count(name
)) {
10964 if (osdmap
.has_erasure_code_profile(name
)) {
10965 pending_inc
.old_erasure_code_profiles
.push_back(name
);
10967 dout(20) << "erasure code profile rm " << name
<< ": creation canceled" << dendl
;
10968 pending_inc
.new_erasure_code_profiles
.erase(name
);
10972 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
10973 get_last_committed() + 1));
10976 ss
<< "erasure-code-profile " << name
<< " does not exist";
10981 } else if (prefix
== "osd erasure-code-profile set") {
10983 cmd_getval(cmdmap
, "name", name
);
10984 vector
<string
> profile
;
10985 cmd_getval(cmdmap
, "profile", profile
);
10987 bool force
= false;
10988 cmd_getval(cmdmap
, "force", force
);
10990 map
<string
,string
> profile_map
;
10991 err
= parse_erasure_code_profile(profile
, &profile_map
, &ss
);
10994 if (auto found
= profile_map
.find("crush-failure-domain");
10995 found
!= profile_map
.end()) {
10996 const auto& failure_domain
= found
->second
;
10997 int failure_domain_type
= osdmap
.crush
->get_type_id(failure_domain
);
10998 if (failure_domain_type
< 0) {
10999 ss
<< "erasure-code-profile " << profile_map
11000 << " contains an invalid failure-domain " << std::quoted(failure_domain
);
11006 if (profile_map
.find("plugin") == profile_map
.end()) {
11007 ss
<< "erasure-code-profile " << profile_map
11008 << " must contain a plugin entry" << std::endl
;
11012 string plugin
= profile_map
["plugin"];
11014 if (pending_inc
.has_erasure_code_profile(name
)) {
11015 dout(20) << "erasure code profile " << name
<< " try again" << dendl
;
11018 err
= normalize_profile(name
, profile_map
, force
, &ss
);
11022 if (osdmap
.has_erasure_code_profile(name
)) {
11023 ErasureCodeProfile existing_profile_map
=
11024 osdmap
.get_erasure_code_profile(name
);
11025 err
= normalize_profile(name
, existing_profile_map
, force
, &ss
);
11029 if (existing_profile_map
== profile_map
) {
11035 ss
<< "will not override erasure code profile " << name
11036 << " because the existing profile "
11037 << existing_profile_map
11038 << " is different from the proposed profile "
11044 dout(20) << "erasure code profile set " << name
<< "="
11045 << profile_map
<< dendl
;
11046 pending_inc
.set_erasure_code_profile(name
, profile_map
);
11050 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11051 get_last_committed() + 1));
11054 } else if (prefix
== "osd crush rule create-erasure") {
11055 err
= check_cluster_features(CEPH_FEATURE_CRUSH_V2
, ss
);
11056 if (err
== -EAGAIN
)
11060 string name
, poolstr
;
11061 cmd_getval(cmdmap
, "name", name
);
11063 cmd_getval(cmdmap
, "profile", profile
);
11065 profile
= "default";
11066 if (profile
== "default") {
11067 if (!osdmap
.has_erasure_code_profile(profile
)) {
11068 if (pending_inc
.has_erasure_code_profile(profile
)) {
11069 dout(20) << "erasure code profile " << profile
<< " already pending" << dendl
;
11073 map
<string
,string
> profile_map
;
11074 err
= osdmap
.get_erasure_code_profile_default(cct
,
11079 err
= normalize_profile(name
, profile_map
, true, &ss
);
11082 dout(20) << "erasure code profile set " << profile
<< "="
11083 << profile_map
<< dendl
;
11084 pending_inc
.set_erasure_code_profile(profile
, profile_map
);
11090 err
= crush_rule_create_erasure(name
, profile
, &rule
, &ss
);
11093 case -EEXIST
: // return immediately
11094 ss
<< "rule " << name
<< " already exists";
11098 case -EALREADY
: // wait for pending to be proposed
11099 ss
<< "rule " << name
<< " already exists";
11102 default: // non recoverable error
11107 ss
<< "created rule " << name
<< " at " << rule
;
11111 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11112 get_last_committed() + 1));
11115 } else if (prefix
== "osd crush rule rm") {
11117 cmd_getval(cmdmap
, "name", name
);
11119 if (!osdmap
.crush
->rule_exists(name
)) {
11120 ss
<< "rule " << name
<< " does not exist";
11125 CrushWrapper newcrush
;
11126 _get_pending_crush(newcrush
);
11128 if (!newcrush
.rule_exists(name
)) {
11129 ss
<< "rule " << name
<< " does not exist";
11132 int ruleno
= newcrush
.get_rule_id(name
);
11133 ceph_assert(ruleno
>= 0);
11135 // make sure it is not in use.
11136 // FIXME: this is ok in some situations, but let's not bother with that
11138 int ruleset
= newcrush
.get_rule_mask_ruleset(ruleno
);
11139 if (osdmap
.crush_rule_in_use(ruleset
)) {
11140 ss
<< "crush ruleset " << name
<< " " << ruleset
<< " is in use";
11145 err
= newcrush
.remove_rule(ruleno
);
11150 pending_inc
.crush
.clear();
11151 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11154 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11155 get_last_committed() + 1));
11158 } else if (prefix
== "osd crush rule rename") {
11161 cmd_getval(cmdmap
, "srcname", srcname
);
11162 cmd_getval(cmdmap
, "dstname", dstname
);
11163 if (srcname
.empty() || dstname
.empty()) {
11164 ss
<< "must specify both source rule name and destination rule name";
11168 if (srcname
== dstname
) {
11169 ss
<< "destination rule name is equal to source rule name";
11174 CrushWrapper newcrush
;
11175 _get_pending_crush(newcrush
);
11176 if (!newcrush
.rule_exists(srcname
) && newcrush
.rule_exists(dstname
)) {
11177 // srcname does not exist and dstname already exists
11178 // suppose this is a replay and return success
11179 // (so this command is idempotent)
11180 ss
<< "already renamed to '" << dstname
<< "'";
11185 err
= newcrush
.rename_rule(srcname
, dstname
, &ss
);
11187 // ss has reason for failure
11190 pending_inc
.crush
.clear();
11191 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
11193 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11194 get_last_committed() + 1));
11197 } else if (prefix
== "osd setmaxosd") {
11199 if (!cmd_getval(cmdmap
, "newmax", newmax
)) {
11200 ss
<< "unable to parse 'newmax' value '"
11201 << cmd_vartype_stringify(cmdmap
.at("newmax")) << "'";
11206 if (newmax
> g_conf()->mon_max_osd
) {
11208 ss
<< "cannot set max_osd to " << newmax
<< " which is > conf.mon_max_osd ("
11209 << g_conf()->mon_max_osd
<< ")";
11213 // Don't allow shrinking OSD number as this will cause data loss
11214 // and may cause kernel crashes.
11215 // Note: setmaxosd sets the maximum OSD number and not the number of OSDs
11216 if (newmax
< osdmap
.get_max_osd()) {
11217 // Check if the OSDs exist between current max and new value.
11218 // If there are any OSDs exist, then don't allow shrinking number
11220 for (int i
= newmax
; i
< osdmap
.get_max_osd(); i
++) {
11221 if (osdmap
.exists(i
)) {
11223 ss
<< "cannot shrink max_osd to " << newmax
11224 << " because osd." << i
<< " (and possibly others) still in use";
11230 pending_inc
.new_max_osd
= newmax
;
11231 ss
<< "set new max_osd = " << pending_inc
.new_max_osd
;
11233 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11234 get_last_committed() + 1));
11237 } else if (prefix
== "osd set-full-ratio" ||
11238 prefix
== "osd set-backfillfull-ratio" ||
11239 prefix
== "osd set-nearfull-ratio") {
11241 if (!cmd_getval(cmdmap
, "ratio", n
)) {
11242 ss
<< "unable to parse 'ratio' value '"
11243 << cmd_vartype_stringify(cmdmap
.at("ratio")) << "'";
11247 if (prefix
== "osd set-full-ratio")
11248 pending_inc
.new_full_ratio
= n
;
11249 else if (prefix
== "osd set-backfillfull-ratio")
11250 pending_inc
.new_backfillfull_ratio
= n
;
11251 else if (prefix
== "osd set-nearfull-ratio")
11252 pending_inc
.new_nearfull_ratio
= n
;
11253 ss
<< prefix
<< " " << n
;
11255 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11256 get_last_committed() + 1));
11258 } else if (prefix
== "osd set-require-min-compat-client") {
11260 cmd_getval(cmdmap
, "version", v
);
11261 ceph_release_t vno
= ceph_release_from_name(v
);
11263 ss
<< "version " << v
<< " is not recognized";
11268 newmap
.deepish_copy_from(osdmap
);
11269 newmap
.apply_incremental(pending_inc
);
11270 newmap
.require_min_compat_client
= vno
;
11271 auto mvno
= newmap
.get_min_compat_client();
11273 ss
<< "osdmap current utilizes features that require " << mvno
11274 << "; cannot set require_min_compat_client below that to " << vno
;
11279 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11282 mon
.get_combined_feature_map(&m
);
11283 uint64_t features
= ceph_release_features(to_integer
<int>(vno
));
11287 CEPH_ENTITY_TYPE_CLIENT
,
11288 CEPH_ENTITY_TYPE_MDS
,
11289 CEPH_ENTITY_TYPE_MGR
}) {
11290 auto p
= m
.m
.find(type
);
11291 if (p
== m
.m
.end()) {
11294 for (auto& q
: p
->second
) {
11295 uint64_t missing
= ~q
.first
& features
;
11298 ss
<< "cannot set require_min_compat_client to " << v
<< ": ";
11303 ss
<< q
.second
<< " connected " << ceph_entity_type_name(type
)
11304 << "(s) look like " << ceph_release_name(
11305 ceph_release_from_features(q
.first
))
11306 << " (missing 0x" << std::hex
<< missing
<< std::dec
<< ")";
11312 ss
<< "; add --yes-i-really-mean-it to do it anyway";
11317 ss
<< "set require_min_compat_client to " << vno
;
11318 pending_inc
.new_require_min_compat_client
= vno
;
11320 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
11321 get_last_committed() + 1));
11323 } else if (prefix
== "osd pause") {
11324 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11326 } else if (prefix
== "osd unpause") {
11327 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11329 } else if (prefix
== "osd set") {
11331 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11334 cmd_getval(cmdmap
, "key", key
);
11335 if (key
== "pause")
11336 return prepare_set_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11337 else if (key
== "noup")
11338 return prepare_set_flag(op
, CEPH_OSDMAP_NOUP
);
11339 else if (key
== "nodown")
11340 return prepare_set_flag(op
, CEPH_OSDMAP_NODOWN
);
11341 else if (key
== "noout")
11342 return prepare_set_flag(op
, CEPH_OSDMAP_NOOUT
);
11343 else if (key
== "noin")
11344 return prepare_set_flag(op
, CEPH_OSDMAP_NOIN
);
11345 else if (key
== "nobackfill")
11346 return prepare_set_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11347 else if (key
== "norebalance")
11348 return prepare_set_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11349 else if (key
== "norecover")
11350 return prepare_set_flag(op
, CEPH_OSDMAP_NORECOVER
);
11351 else if (key
== "noscrub")
11352 return prepare_set_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11353 else if (key
== "nodeep-scrub")
11354 return prepare_set_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11355 else if (key
== "notieragent")
11356 return prepare_set_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11357 else if (key
== "nosnaptrim")
11358 return prepare_set_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11359 else if (key
== "pglog_hardlimit") {
11360 if (!osdmap
.get_num_up_osds() && !sure
) {
11361 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11362 << "--yes-i-really-mean-it if you really wish to continue.";
11366 // The release check here is required because for OSD_PGLOG_HARDLIMIT,
11367 // we are reusing a jewel feature bit that was retired in luminous.
11368 if (osdmap
.require_osd_release
>= ceph_release_t::luminous
&&
11369 (HAVE_FEATURE(osdmap
.get_up_osd_features(), OSD_PGLOG_HARDLIMIT
)
11371 return prepare_set_flag(op
, CEPH_OSDMAP_PGLOG_HARDLIMIT
);
11373 ss
<< "not all up OSDs have OSD_PGLOG_HARDLIMIT feature";
11378 ss
<< "unrecognized flag '" << key
<< "'";
11382 } else if (prefix
== "osd unset") {
11384 cmd_getval(cmdmap
, "key", key
);
11385 if (key
== "pause")
11386 return prepare_unset_flag(op
, CEPH_OSDMAP_PAUSERD
| CEPH_OSDMAP_PAUSEWR
);
11387 else if (key
== "noup")
11388 return prepare_unset_flag(op
, CEPH_OSDMAP_NOUP
);
11389 else if (key
== "nodown")
11390 return prepare_unset_flag(op
, CEPH_OSDMAP_NODOWN
);
11391 else if (key
== "noout")
11392 return prepare_unset_flag(op
, CEPH_OSDMAP_NOOUT
);
11393 else if (key
== "noin")
11394 return prepare_unset_flag(op
, CEPH_OSDMAP_NOIN
);
11395 else if (key
== "nobackfill")
11396 return prepare_unset_flag(op
, CEPH_OSDMAP_NOBACKFILL
);
11397 else if (key
== "norebalance")
11398 return prepare_unset_flag(op
, CEPH_OSDMAP_NOREBALANCE
);
11399 else if (key
== "norecover")
11400 return prepare_unset_flag(op
, CEPH_OSDMAP_NORECOVER
);
11401 else if (key
== "noscrub")
11402 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSCRUB
);
11403 else if (key
== "nodeep-scrub")
11404 return prepare_unset_flag(op
, CEPH_OSDMAP_NODEEP_SCRUB
);
11405 else if (key
== "notieragent")
11406 return prepare_unset_flag(op
, CEPH_OSDMAP_NOTIERAGENT
);
11407 else if (key
== "nosnaptrim")
11408 return prepare_unset_flag(op
, CEPH_OSDMAP_NOSNAPTRIM
);
11410 ss
<< "unrecognized flag '" << key
<< "'";
11414 } else if (prefix
== "osd require-osd-release") {
11416 cmd_getval(cmdmap
, "release", release
);
11418 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
11419 ceph_release_t rel
= ceph_release_from_name(release
.c_str());
11421 ss
<< "unrecognized release " << release
;
11425 if (rel
== osdmap
.require_osd_release
) {
11430 ceph_assert(osdmap
.require_osd_release
>= ceph_release_t::luminous
);
11431 if (!osdmap
.get_num_up_osds() && !sure
) {
11432 ss
<< "Not advisable to continue since no OSDs are up. Pass "
11433 << "--yes-i-really-mean-it if you really wish to continue.";
11437 if (rel
== ceph_release_t::mimic
) {
11438 if (!mon
.monmap
->get_required_features().contains_all(
11439 ceph::features::mon::FEATURE_MIMIC
)) {
11440 ss
<< "not all mons are mimic";
11444 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_MIMIC
))
11446 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_MIMIC feature";
11450 } else if (rel
== ceph_release_t::nautilus
) {
11451 if (!mon
.monmap
->get_required_features().contains_all(
11452 ceph::features::mon::FEATURE_NAUTILUS
)) {
11453 ss
<< "not all mons are nautilus";
11457 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_NAUTILUS
))
11459 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_NAUTILUS feature";
11463 } else if (rel
== ceph_release_t::octopus
) {
11464 if (!mon
.monmap
->get_required_features().contains_all(
11465 ceph::features::mon::FEATURE_OCTOPUS
)) {
11466 ss
<< "not all mons are octopus";
11470 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_OCTOPUS
))
11472 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_OCTOPUS feature";
11476 } else if (rel
== ceph_release_t::pacific
) {
11477 if (!mon
.monmap
->get_required_features().contains_all(
11478 ceph::features::mon::FEATURE_PACIFIC
)) {
11479 ss
<< "not all mons are pacific";
11483 if ((!HAVE_FEATURE(osdmap
.get_up_osd_features(), SERVER_PACIFIC
))
11485 ss
<< "not all up OSDs have CEPH_FEATURE_SERVER_PACIFIC feature";
11490 ss
<< "not supported for this release yet";
11494 if (rel
< osdmap
.require_osd_release
) {
11495 ss
<< "require_osd_release cannot be lowered once it has been set";
11499 pending_inc
.new_require_osd_release
= rel
;
11501 } else if (prefix
== "osd down" ||
11502 prefix
== "osd out" ||
11503 prefix
== "osd in" ||
11504 prefix
== "osd rm" ||
11505 prefix
== "osd stop") {
11509 bool verbose
= true;
11510 bool definitely_dead
= false;
11512 vector
<string
> idvec
;
11513 cmd_getval(cmdmap
, "ids", idvec
);
11514 cmd_getval(cmdmap
, "definitely_dead", definitely_dead
);
11515 derr
<< "definitely_dead " << (int)definitely_dead
<< dendl
;
11516 for (unsigned j
= 0; j
< idvec
.size() && !stop
; j
++) {
11521 (idvec
[0] == "any" || idvec
[0] == "all" || idvec
[0] == "*")) {
11522 if (prefix
== "osd in") {
11523 // touch out osds only
11524 osdmap
.get_out_existing_osds(osds
);
11526 osdmap
.get_all_osds(osds
);
11529 verbose
= false; // so the output is less noisy.
11531 long osd
= parse_osd_id(idvec
[j
].c_str(), &ss
);
11533 ss
<< "invalid osd id" << osd
;
11536 } else if (!osdmap
.exists(osd
)) {
11537 ss
<< "osd." << osd
<< " does not exist. ";
11544 for (auto &osd
: osds
) {
11545 if (prefix
== "osd down") {
11546 if (osdmap
.is_down(osd
)) {
11548 ss
<< "osd." << osd
<< " is already down. ";
11550 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
);
11551 ss
<< "marked down osd." << osd
<< ". ";
11554 if (definitely_dead
) {
11555 if (!pending_inc
.new_xinfo
.count(osd
)) {
11556 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11558 if (pending_inc
.new_xinfo
[osd
].dead_epoch
< pending_inc
.epoch
) {
11561 pending_inc
.new_xinfo
[osd
].dead_epoch
= pending_inc
.epoch
;
11563 } else if (prefix
== "osd out") {
11564 if (osdmap
.is_out(osd
)) {
11566 ss
<< "osd." << osd
<< " is already out. ";
11568 pending_inc
.new_weight
[osd
] = CEPH_OSD_OUT
;
11569 if (osdmap
.osd_weight
[osd
]) {
11570 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11571 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11573 pending_inc
.new_xinfo
[osd
].old_weight
= osdmap
.osd_weight
[osd
];
11575 ss
<< "marked out osd." << osd
<< ". ";
11576 std::ostringstream msg
;
11577 msg
<< "Client " << op
->get_session()->entity_name
11578 << " marked osd." << osd
<< " out";
11579 if (osdmap
.is_up(osd
)) {
11580 msg
<< ", while it was still marked up";
11582 auto period
= ceph_clock_now() - down_pending_out
[osd
];
11583 msg
<< ", after it was down for " << int(period
.sec())
11587 mon
.clog
->info() << msg
.str();
11590 } else if (prefix
== "osd in") {
11591 if (osdmap
.is_in(osd
)) {
11593 ss
<< "osd." << osd
<< " is already in. ";
11595 if (osdmap
.osd_xinfo
[osd
].old_weight
> 0) {
11596 pending_inc
.new_weight
[osd
] = osdmap
.osd_xinfo
[osd
].old_weight
;
11597 if (pending_inc
.new_xinfo
.count(osd
) == 0) {
11598 pending_inc
.new_xinfo
[osd
] = osdmap
.osd_xinfo
[osd
];
11600 pending_inc
.new_xinfo
[osd
].old_weight
= 0;
11602 pending_inc
.new_weight
[osd
] = CEPH_OSD_IN
;
11604 ss
<< "marked in osd." << osd
<< ". ";
11607 } else if (prefix
== "osd rm") {
11608 err
= prepare_command_osd_remove(osd
);
11610 if (err
== -EBUSY
) {
11613 ss
<< "osd." << osd
<< " is still up; must be down before removal. ";
11615 ceph_assert(err
== 0);
11617 ss
<< ", osd." << osd
;
11619 ss
<< "removed osd." << osd
;
11623 } else if (prefix
== "osd stop") {
11624 if (osdmap
.is_stop(osd
)) {
11626 ss
<< "osd." << osd
<< " is already stopped. ";
11627 } else if (osdmap
.is_down(osd
)) {
11628 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_STOP
);
11629 ss
<< "stop down osd." << osd
<< ". ";
11632 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_UP
| CEPH_OSD_STOP
);
11633 ss
<< "stop osd." << osd
<< ". ";
11641 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11642 get_last_committed() + 1));
11645 } else if (prefix
== "osd set-group" ||
11646 prefix
== "osd unset-group" ||
11647 prefix
== "osd add-noup" ||
11648 prefix
== "osd add-nodown" ||
11649 prefix
== "osd add-noin" ||
11650 prefix
== "osd add-noout" ||
11651 prefix
== "osd rm-noup" ||
11652 prefix
== "osd rm-nodown" ||
11653 prefix
== "osd rm-noin" ||
11654 prefix
== "osd rm-noout") {
11655 bool do_set
= prefix
== "osd set-group" ||
11656 prefix
.find("add") != string::npos
;
11658 unsigned flags
= 0;
11659 vector
<string
> who
;
11660 if (prefix
== "osd set-group" || prefix
== "osd unset-group") {
11661 cmd_getval(cmdmap
, "flags", flag_str
);
11662 cmd_getval(cmdmap
, "who", who
);
11663 vector
<string
> raw_flags
;
11664 boost::split(raw_flags
, flag_str
, boost::is_any_of(","));
11665 for (auto& f
: raw_flags
) {
11667 flags
|= CEPH_OSD_NOUP
;
11668 else if (f
== "nodown")
11669 flags
|= CEPH_OSD_NODOWN
;
11670 else if (f
== "noin")
11671 flags
|= CEPH_OSD_NOIN
;
11672 else if (f
== "noout")
11673 flags
|= CEPH_OSD_NOOUT
;
11675 ss
<< "unrecognized flag '" << f
<< "', must be one of "
11676 << "{noup,nodown,noin,noout}";
11682 cmd_getval(cmdmap
, "ids", who
);
11683 if (prefix
.find("noup") != string::npos
)
11684 flags
= CEPH_OSD_NOUP
;
11685 else if (prefix
.find("nodown") != string::npos
)
11686 flags
= CEPH_OSD_NODOWN
;
11687 else if (prefix
.find("noin") != string::npos
)
11688 flags
= CEPH_OSD_NOIN
;
11689 else if (prefix
.find("noout") != string::npos
)
11690 flags
= CEPH_OSD_NOOUT
;
11692 ceph_assert(0 == "Unreachable!");
11695 ss
<< "must specify flag(s) {noup,nodwon,noin,noout} to set/unset";
11700 ss
<< "must specify at least one or more targets to set/unset";
11705 set
<int> crush_nodes
;
11706 set
<int> device_classes
;
11707 for (auto& w
: who
) {
11708 if (w
== "any" || w
== "all" || w
== "*") {
11709 osdmap
.get_all_osds(osds
);
11712 std::stringstream ts
;
11713 if (auto osd
= parse_osd_id(w
.c_str(), &ts
); osd
>= 0) {
11715 } else if (osdmap
.crush
->name_exists(w
)) {
11716 crush_nodes
.insert(osdmap
.crush
->get_item_id(w
));
11717 } else if (osdmap
.crush
->class_exists(w
)) {
11718 device_classes
.insert(osdmap
.crush
->get_class_id(w
));
11720 ss
<< "unable to parse osd id or crush node or device class: "
11721 << "\"" << w
<< "\". ";
11724 if (osds
.empty() && crush_nodes
.empty() && device_classes
.empty()) {
11725 // ss has reason for failure
11730 for (auto osd
: osds
) {
11731 if (!osdmap
.exists(osd
)) {
11732 ss
<< "osd." << osd
<< " does not exist. ";
11736 if (flags
& CEPH_OSD_NOUP
) {
11737 any
|= osdmap
.is_noup_by_osd(osd
) ?
11738 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
) :
11739 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
);
11741 if (flags
& CEPH_OSD_NODOWN
) {
11742 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11743 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
) :
11744 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
);
11746 if (flags
& CEPH_OSD_NOIN
) {
11747 any
|= osdmap
.is_noin_by_osd(osd
) ?
11748 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
) :
11749 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
);
11751 if (flags
& CEPH_OSD_NOOUT
) {
11752 any
|= osdmap
.is_noout_by_osd(osd
) ?
11753 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
) :
11754 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
);
11757 if (flags
& CEPH_OSD_NOUP
) {
11758 any
|= osdmap
.is_noup_by_osd(osd
) ?
11759 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOUP
) :
11760 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOUP
);
11762 if (flags
& CEPH_OSD_NODOWN
) {
11763 any
|= osdmap
.is_nodown_by_osd(osd
) ?
11764 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NODOWN
) :
11765 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NODOWN
);
11767 if (flags
& CEPH_OSD_NOIN
) {
11768 any
|= osdmap
.is_noin_by_osd(osd
) ?
11769 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOIN
) :
11770 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOIN
);
11772 if (flags
& CEPH_OSD_NOOUT
) {
11773 any
|= osdmap
.is_noout_by_osd(osd
) ?
11774 pending_inc
.pending_osd_state_set(osd
, CEPH_OSD_NOOUT
) :
11775 pending_inc
.pending_osd_state_clear(osd
, CEPH_OSD_NOOUT
);
11779 for (auto& id
: crush_nodes
) {
11780 auto old_flags
= osdmap
.get_crush_node_flags(id
);
11781 auto& pending_flags
= pending_inc
.new_crush_node_flags
[id
];
11782 pending_flags
|= old_flags
; // adopt existing flags first!
11784 pending_flags
|= flags
;
11786 pending_flags
&= ~flags
;
11790 for (auto& id
: device_classes
) {
11791 auto old_flags
= osdmap
.get_device_class_flags(id
);
11792 auto& pending_flags
= pending_inc
.new_device_class_flags
[id
];
11793 pending_flags
|= old_flags
;
11795 pending_flags
|= flags
;
11797 pending_flags
&= ~flags
;
11803 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, err
, rs
,
11804 get_last_committed() + 1));
11807 } else if (prefix
== "osd pg-temp") {
11809 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11810 ss
<< "unable to parse 'pgid' value '"
11811 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11816 if (!pgid
.parse(pgidstr
.c_str())) {
11817 ss
<< "invalid pgid '" << pgidstr
<< "'";
11821 if (!osdmap
.pg_exists(pgid
)) {
11822 ss
<< "pg " << pgid
<< " does not exist";
11826 if (pending_inc
.new_pg_temp
.count(pgid
)) {
11827 dout(10) << __func__
<< " waiting for pending update on " << pgid
<< dendl
;
11828 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
11832 vector
<int64_t> id_vec
;
11833 vector
<int32_t> new_pg_temp
;
11834 cmd_getval(cmdmap
, "id", id_vec
);
11835 if (id_vec
.empty()) {
11836 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>();
11837 ss
<< "done cleaning up pg_temp of " << pgid
;
11840 for (auto osd
: id_vec
) {
11841 if (!osdmap
.exists(osd
)) {
11842 ss
<< "osd." << osd
<< " does not exist";
11846 new_pg_temp
.push_back(osd
);
11849 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
11850 if ((int)new_pg_temp
.size() < pool_min_size
) {
11851 ss
<< "num of osds (" << new_pg_temp
.size() <<") < pool min size ("
11852 << pool_min_size
<< ")";
11857 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
11858 if ((int)new_pg_temp
.size() > pool_size
) {
11859 ss
<< "num of osds (" << new_pg_temp
.size() <<") > pool size ("
11860 << pool_size
<< ")";
11865 pending_inc
.new_pg_temp
[pgid
] = mempool::osdmap::vector
<int>(
11866 new_pg_temp
.begin(), new_pg_temp
.end());
11867 ss
<< "set " << pgid
<< " pg_temp mapping to " << new_pg_temp
;
11869 } else if (prefix
== "osd primary-temp") {
11871 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11872 ss
<< "unable to parse 'pgid' value '"
11873 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11878 if (!pgid
.parse(pgidstr
.c_str())) {
11879 ss
<< "invalid pgid '" << pgidstr
<< "'";
11883 if (!osdmap
.pg_exists(pgid
)) {
11884 ss
<< "pg " << pgid
<< " does not exist";
11890 if (!cmd_getval(cmdmap
, "id", osd
)) {
11891 ss
<< "unable to parse 'id' value '"
11892 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
11896 if (osd
!= -1 && !osdmap
.exists(osd
)) {
11897 ss
<< "osd." << osd
<< " does not exist";
11902 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
11903 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
11904 ss
<< "require_min_compat_client "
11905 << osdmap
.require_min_compat_client
11906 << " < firefly, which is required for primary-temp";
11911 pending_inc
.new_primary_temp
[pgid
] = osd
;
11912 ss
<< "set " << pgid
<< " primary_temp mapping to " << osd
;
11914 } else if (prefix
== "pg repeer") {
11917 cmd_getval(cmdmap
, "pgid", pgidstr
);
11918 if (!pgid
.parse(pgidstr
.c_str())) {
11919 ss
<< "invalid pgid '" << pgidstr
<< "'";
11923 if (!osdmap
.pg_exists(pgid
)) {
11924 ss
<< "pg '" << pgidstr
<< "' does not exist";
11928 vector
<int> acting
;
11930 osdmap
.pg_to_acting_osds(pgid
, &acting
, &primary
);
11933 ss
<< "pg currently has no primary";
11936 if (acting
.size() > 1) {
11937 // map to just primary; it will map back to what it wants
11938 pending_inc
.new_pg_temp
[pgid
] = { primary
};
11940 // hmm, pick another arbitrary osd to induce a change. Note
11941 // that this won't work if there is only one suitable OSD in the cluster.
11944 for (i
= 0; i
< osdmap
.get_max_osd(); ++i
) {
11945 if (i
== primary
|| !osdmap
.is_up(i
) || !osdmap
.exists(i
)) {
11948 pending_inc
.new_pg_temp
[pgid
] = { primary
, i
};
11954 ss
<< "not enough up OSDs in the cluster to force repeer";
11959 } else if (prefix
== "osd pg-upmap" ||
11960 prefix
== "osd rm-pg-upmap" ||
11961 prefix
== "osd pg-upmap-items" ||
11962 prefix
== "osd rm-pg-upmap-items") {
11963 if (osdmap
.require_min_compat_client
< ceph_release_t::luminous
) {
11964 ss
<< "min_compat_client "
11965 << osdmap
.require_min_compat_client
11966 << " < luminous, which is required for pg-upmap. "
11967 << "Try 'ceph osd set-require-min-compat-client luminous' "
11968 << "before using the new interface";
11972 err
= check_cluster_features(CEPH_FEATUREMASK_OSDMAP_PG_UPMAP
, ss
);
11973 if (err
== -EAGAIN
)
11978 if (!cmd_getval(cmdmap
, "pgid", pgidstr
)) {
11979 ss
<< "unable to parse 'pgid' value '"
11980 << cmd_vartype_stringify(cmdmap
.at("pgid")) << "'";
11985 if (!pgid
.parse(pgidstr
.c_str())) {
11986 ss
<< "invalid pgid '" << pgidstr
<< "'";
11990 if (!osdmap
.pg_exists(pgid
)) {
11991 ss
<< "pg " << pgid
<< " does not exist";
11995 if (pending_inc
.old_pools
.count(pgid
.pool())) {
11996 ss
<< "pool of " << pgid
<< " is pending removal";
11999 wait_for_finished_proposal(op
,
12000 new Monitor::C_Command(mon
, op
, err
, rs
, get_last_committed() + 1));
12008 OP_RM_PG_UPMAP_ITEMS
,
12011 if (prefix
== "osd pg-upmap") {
12012 option
= OP_PG_UPMAP
;
12013 } else if (prefix
== "osd rm-pg-upmap") {
12014 option
= OP_RM_PG_UPMAP
;
12015 } else if (prefix
== "osd pg-upmap-items") {
12016 option
= OP_PG_UPMAP_ITEMS
;
12018 option
= OP_RM_PG_UPMAP_ITEMS
;
12021 // check pending upmap changes
12023 case OP_PG_UPMAP
: // fall through
12024 case OP_RM_PG_UPMAP
:
12025 if (pending_inc
.new_pg_upmap
.count(pgid
) ||
12026 pending_inc
.old_pg_upmap
.count(pgid
)) {
12027 dout(10) << __func__
<< " waiting for pending update on "
12029 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12034 case OP_PG_UPMAP_ITEMS
: // fall through
12035 case OP_RM_PG_UPMAP_ITEMS
:
12036 if (pending_inc
.new_pg_upmap_items
.count(pgid
) ||
12037 pending_inc
.old_pg_upmap_items
.count(pgid
)) {
12038 dout(10) << __func__
<< " waiting for pending update on "
12040 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12046 ceph_abort_msg("invalid option");
12052 vector
<int64_t> id_vec
;
12053 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12054 ss
<< "unable to parse 'id' value(s) '"
12055 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12060 int pool_min_size
= osdmap
.get_pg_pool_min_size(pgid
);
12061 if ((int)id_vec
.size() < pool_min_size
) {
12062 ss
<< "num of osds (" << id_vec
.size() <<") < pool min size ("
12063 << pool_min_size
<< ")";
12068 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12069 if ((int)id_vec
.size() > pool_size
) {
12070 ss
<< "num of osds (" << id_vec
.size() <<") > pool size ("
12071 << pool_size
<< ")";
12076 vector
<int32_t> new_pg_upmap
;
12077 for (auto osd
: id_vec
) {
12078 if (osd
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(osd
)) {
12079 ss
<< "osd." << osd
<< " does not exist";
12083 auto it
= std::find(new_pg_upmap
.begin(), new_pg_upmap
.end(), osd
);
12084 if (it
!= new_pg_upmap
.end()) {
12085 ss
<< "osd." << osd
<< " already exists, ";
12088 new_pg_upmap
.push_back(osd
);
12091 if (new_pg_upmap
.empty()) {
12092 ss
<< "no valid upmap items(pairs) is specified";
12097 pending_inc
.new_pg_upmap
[pgid
] = mempool::osdmap::vector
<int32_t>(
12098 new_pg_upmap
.begin(), new_pg_upmap
.end());
12099 ss
<< "set " << pgid
<< " pg_upmap mapping to " << new_pg_upmap
;
12103 case OP_RM_PG_UPMAP
:
12105 pending_inc
.old_pg_upmap
.insert(pgid
);
12106 ss
<< "clear " << pgid
<< " pg_upmap mapping";
12110 case OP_PG_UPMAP_ITEMS
:
12112 vector
<int64_t> id_vec
;
12113 if (!cmd_getval(cmdmap
, "id", id_vec
)) {
12114 ss
<< "unable to parse 'id' value(s) '"
12115 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12120 if (id_vec
.size() % 2) {
12121 ss
<< "you must specify pairs of osd ids to be remapped";
12126 int pool_size
= osdmap
.get_pg_pool_size(pgid
);
12127 if ((int)(id_vec
.size() / 2) > pool_size
) {
12128 ss
<< "num of osd pairs (" << id_vec
.size() / 2 <<") > pool size ("
12129 << pool_size
<< ")";
12134 vector
<pair
<int32_t,int32_t>> new_pg_upmap_items
;
12135 ostringstream items
;
12137 for (auto p
= id_vec
.begin(); p
!= id_vec
.end(); ++p
) {
12141 ss
<< "from osd." << from
<< " == to osd." << to
<< ", ";
12144 if (!osdmap
.exists(from
)) {
12145 ss
<< "osd." << from
<< " does not exist";
12149 if (to
!= CRUSH_ITEM_NONE
&& !osdmap
.exists(to
)) {
12150 ss
<< "osd." << to
<< " does not exist";
12154 pair
<int32_t,int32_t> entry
= make_pair(from
, to
);
12155 auto it
= std::find(new_pg_upmap_items
.begin(),
12156 new_pg_upmap_items
.end(), entry
);
12157 if (it
!= new_pg_upmap_items
.end()) {
12158 ss
<< "osd." << from
<< " -> osd." << to
<< " already exists, ";
12161 new_pg_upmap_items
.push_back(entry
);
12162 items
<< from
<< "->" << to
<< ",";
12164 string
out(items
.str());
12165 out
.resize(out
.size() - 1); // drop last ','
12168 if (new_pg_upmap_items
.empty()) {
12169 ss
<< "no valid upmap items(pairs) is specified";
12174 pending_inc
.new_pg_upmap_items
[pgid
] =
12175 mempool::osdmap::vector
<pair
<int32_t,int32_t>>(
12176 new_pg_upmap_items
.begin(), new_pg_upmap_items
.end());
12177 ss
<< "set " << pgid
<< " pg_upmap_items mapping to " << out
;
12181 case OP_RM_PG_UPMAP_ITEMS
:
12183 pending_inc
.old_pg_upmap_items
.insert(pgid
);
12184 ss
<< "clear " << pgid
<< " pg_upmap_items mapping";
12189 ceph_abort_msg("invalid option");
12193 } else if (prefix
== "osd primary-affinity") {
12195 if (!cmd_getval(cmdmap
, "id", id
)) {
12196 ss
<< "invalid osd id value '"
12197 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12202 if (!cmd_getval(cmdmap
, "weight", w
)) {
12203 ss
<< "unable to parse 'weight' value '"
12204 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12208 long ww
= (int)((double)CEPH_OSD_MAX_PRIMARY_AFFINITY
*w
);
12210 ss
<< "weight must be >= 0";
12214 if (osdmap
.require_min_compat_client
!= ceph_release_t::unknown
&&
12215 osdmap
.require_min_compat_client
< ceph_release_t::firefly
) {
12216 ss
<< "require_min_compat_client "
12217 << osdmap
.require_min_compat_client
12218 << " < firefly, which is required for primary-affinity";
12222 if (osdmap
.exists(id
)) {
12223 pending_inc
.new_primary_affinity
[id
] = ww
;
12224 ss
<< "set osd." << id
<< " primary-affinity to " << w
<< " (" << std::ios::hex
<< ww
<< std::ios::dec
<< ")";
12226 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12227 get_last_committed() + 1));
12230 ss
<< "osd." << id
<< " does not exist";
12234 } else if (prefix
== "osd reweight") {
12236 if (!cmd_getval(cmdmap
, "id", id
)) {
12237 ss
<< "unable to parse osd id value '"
12238 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12243 if (!cmd_getval(cmdmap
, "weight", w
)) {
12244 ss
<< "unable to parse weight value '"
12245 << cmd_vartype_stringify(cmdmap
.at("weight")) << "'";
12249 long ww
= (int)((double)CEPH_OSD_IN
*w
);
12251 ss
<< "weight must be >= 0";
12255 if (osdmap
.exists(id
)) {
12256 pending_inc
.new_weight
[id
] = ww
;
12257 ss
<< "reweighted osd." << id
<< " to " << w
<< " (" << std::hex
<< ww
<< std::dec
<< ")";
12259 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12260 get_last_committed() + 1));
12263 ss
<< "osd." << id
<< " does not exist";
12267 } else if (prefix
== "osd reweightn") {
12268 map
<int32_t, uint32_t> weights
;
12269 err
= parse_reweights(cct
, cmdmap
, osdmap
, &weights
);
12271 ss
<< "unable to parse 'weights' value '"
12272 << cmd_vartype_stringify(cmdmap
.at("weights")) << "'";
12275 pending_inc
.new_weight
.insert(weights
.begin(), weights
.end());
12276 wait_for_finished_proposal(
12278 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
, get_last_committed() + 1));
12280 } else if (prefix
== "osd lost") {
12282 if (!cmd_getval(cmdmap
, "id", id
)) {
12283 ss
<< "unable to parse osd id value '"
12284 << cmd_vartype_stringify(cmdmap
.at("id")) << "'";
12289 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12291 ss
<< "are you SURE? this might mean real, permanent data loss. pass "
12292 "--yes-i-really-mean-it if you really do.";
12295 } else if (!osdmap
.exists(id
)) {
12296 ss
<< "osd." << id
<< " does not exist";
12299 } else if (!osdmap
.is_down(id
)) {
12300 ss
<< "osd." << id
<< " is not down";
12304 epoch_t e
= osdmap
.get_info(id
).down_at
;
12305 pending_inc
.new_lost
[id
] = e
;
12306 ss
<< "marked osd lost in epoch " << e
;
12308 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12309 get_last_committed() + 1));
12313 } else if (prefix
== "osd destroy-actual" ||
12314 prefix
== "osd purge-actual" ||
12315 prefix
== "osd purge-new") {
12316 /* Destroying an OSD means that we don't expect to further make use of
12317 * the OSDs data (which may even become unreadable after this operation),
12318 * and that we are okay with scrubbing all its cephx keys and config-key
12319 * data (which may include lockbox keys, thus rendering the osd's data
12322 * The OSD will not be removed. Instead, we will mark it as destroyed,
12323 * such that a subsequent call to `create` will not reuse the osd id.
12324 * This will play into being able to recreate the OSD, at the same
12325 * crush location, with minimal data movement.
12328 // make sure authmon is writeable.
12329 if (!mon
.authmon()->is_writeable()) {
12330 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12331 << "osd destroy" << dendl
;
12332 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12337 if (!cmd_getval(cmdmap
, "id", id
)) {
12338 auto p
= cmdmap
.find("id");
12339 if (p
== cmdmap
.end()) {
12340 ss
<< "no osd id specified";
12342 ss
<< "unable to parse osd id value '"
12343 << cmd_vartype_stringify(cmdmap
.at("id")) << "";
12349 bool is_destroy
= (prefix
== "osd destroy-actual");
12351 ceph_assert("osd purge-actual" == prefix
||
12352 "osd purge-new" == prefix
);
12356 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12358 ss
<< "Are you SURE? Did you verify with 'ceph osd safe-to-destroy'? "
12359 << "This will mean real, permanent data loss, as well "
12360 << "as deletion of cephx and lockbox keys. "
12361 << "Pass --yes-i-really-mean-it if you really do.";
12364 } else if (!osdmap
.exists(id
)) {
12365 ss
<< "osd." << id
<< " does not exist";
12366 err
= 0; // idempotent
12368 } else if (osdmap
.is_up(id
)) {
12369 ss
<< "osd." << id
<< " is not `down`.";
12372 } else if (is_destroy
&& osdmap
.is_destroyed(id
)) {
12373 ss
<< "destroyed osd." << id
;
12378 if (prefix
== "osd purge-new" &&
12379 (osdmap
.get_state(id
) & CEPH_OSD_NEW
) == 0) {
12380 ss
<< "osd." << id
<< " is not new";
12385 bool goto_reply
= false;
12389 err
= prepare_command_osd_destroy(id
, ss
);
12390 // we checked above that it should exist.
12391 ceph_assert(err
!= -ENOENT
);
12393 err
= prepare_command_osd_purge(id
, ss
);
12394 if (err
== -ENOENT
) {
12396 ss
<< "osd." << id
<< " does not exist.";
12402 if (err
< 0 || goto_reply
) {
12407 ss
<< "destroyed osd." << id
;
12409 ss
<< "purged osd." << id
;
12413 wait_for_finished_proposal(op
,
12414 new Monitor::C_Command(mon
, op
, 0, rs
, get_last_committed() + 1));
12415 force_immediate_propose();
12418 } else if (prefix
== "osd new") {
12420 // make sure authmon is writeable.
12421 if (!mon
.authmon()->is_writeable()) {
12422 dout(10) << __func__
<< " waiting for auth mon to be writeable for "
12423 << "osd new" << dendl
;
12424 mon
.authmon()->wait_for_writeable(op
, new C_RetryMessage(this, op
));
12428 map
<string
,string
> param_map
;
12430 bufferlist bl
= m
->get_data();
12431 string param_json
= bl
.to_str();
12432 dout(20) << __func__
<< " osd new json = " << param_json
<< dendl
;
12434 err
= get_json_str_map(param_json
, ss
, ¶m_map
);
12438 dout(20) << __func__
<< " osd new params " << param_map
<< dendl
;
12441 err
= prepare_command_osd_new(op
, cmdmap
, param_map
, ss
, f
.get());
12454 if (err
== EEXIST
) {
12455 // idempotent operation
12460 wait_for_finished_proposal(op
,
12461 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12462 get_last_committed() + 1));
12463 force_immediate_propose();
12466 } else if (prefix
== "osd create") {
12468 // optional id provided?
12469 int64_t id
= -1, cmd_id
= -1;
12470 if (cmd_getval(cmdmap
, "id", cmd_id
)) {
12472 ss
<< "invalid osd id value '" << cmd_id
<< "'";
12476 dout(10) << " osd create got id " << cmd_id
<< dendl
;
12481 if (cmd_getval(cmdmap
, "uuid", uuidstr
)) {
12482 if (!uuid
.parse(uuidstr
.c_str())) {
12483 ss
<< "invalid uuid value '" << uuidstr
<< "'";
12487 // we only care about the id if we also have the uuid, to
12488 // ensure the operation's idempotency.
12492 int32_t new_id
= -1;
12493 err
= prepare_command_osd_create(id
, uuid
, &new_id
, ss
);
12495 if (err
== -EAGAIN
) {
12496 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12499 // a check has failed; reply to the user.
12502 } else if (err
== EEXIST
) {
12503 // this is an idempotent operation; we can go ahead and reply.
12505 f
->open_object_section("created_osd");
12506 f
->dump_int("osdid", new_id
);
12507 f
->close_section();
12517 string empty_device_class
;
12518 do_osd_create(id
, uuid
, empty_device_class
, &new_id
);
12521 f
->open_object_section("created_osd");
12522 f
->dump_int("osdid", new_id
);
12523 f
->close_section();
12529 wait_for_finished_proposal(op
,
12530 new Monitor::C_Command(mon
, op
, 0, rs
, rdata
,
12531 get_last_committed() + 1));
12534 } else if (prefix
== "osd blocklist clear" ||
12535 prefix
== "osd blacklist clear") {
12536 pending_inc
.new_blocklist
.clear();
12537 std::list
<std::pair
<entity_addr_t
,utime_t
> > blocklist
;
12538 osdmap
.get_blocklist(&blocklist
);
12539 for (const auto &entry
: blocklist
) {
12540 pending_inc
.old_blocklist
.push_back(entry
.first
);
12542 ss
<< " removed all blocklist entries";
12544 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12545 get_last_committed() + 1));
12547 } else if (prefix
== "osd blocklist" ||
12548 prefix
== "osd blacklist") {
12550 cmd_getval(cmdmap
, "addr", addrstr
);
12551 entity_addr_t addr
;
12552 if (!addr
.parse(addrstr
.c_str(), 0)) {
12553 ss
<< "unable to parse address " << addrstr
;
12558 if (osdmap
.require_osd_release
>= ceph_release_t::nautilus
) {
12559 // always blocklist type ANY
12560 addr
.set_type(entity_addr_t::TYPE_ANY
);
12562 addr
.set_type(entity_addr_t::TYPE_LEGACY
);
12565 string blocklistop
;
12566 if (!cmd_getval(cmdmap
, "blocklistop", blocklistop
)) {
12567 cmd_getval(cmdmap
, "blacklistop", blocklistop
);
12569 if (blocklistop
== "add") {
12570 utime_t expires
= ceph_clock_now();
12572 // default one hour
12573 cmd_getval(cmdmap
, "expire", d
,
12574 g_conf()->mon_osd_blocklist_default_expire
);
12577 pending_inc
.new_blocklist
[addr
] = expires
;
12580 // cancel any pending un-blocklisting request too
12581 auto it
= std::find(pending_inc
.old_blocklist
.begin(),
12582 pending_inc
.old_blocklist
.end(), addr
);
12583 if (it
!= pending_inc
.old_blocklist
.end()) {
12584 pending_inc
.old_blocklist
.erase(it
);
12588 ss
<< "blocklisting " << addr
<< " until " << expires
<< " (" << d
<< " sec)";
12590 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12591 get_last_committed() + 1));
12593 } else if (blocklistop
== "rm") {
12594 if (osdmap
.is_blocklisted(addr
) ||
12595 pending_inc
.new_blocklist
.count(addr
)) {
12596 if (osdmap
.is_blocklisted(addr
))
12597 pending_inc
.old_blocklist
.push_back(addr
);
12599 pending_inc
.new_blocklist
.erase(addr
);
12600 ss
<< "un-blocklisting " << addr
;
12602 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12603 get_last_committed() + 1));
12606 ss
<< addr
<< " isn't blocklisted";
12611 } else if (prefix
== "osd pool mksnap") {
12613 cmd_getval(cmdmap
, "pool", poolstr
);
12614 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12616 ss
<< "unrecognized pool '" << poolstr
<< "'";
12621 cmd_getval(cmdmap
, "snap", snapname
);
12622 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12623 if (p
->is_unmanaged_snaps_mode()) {
12624 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12627 } else if (p
->snap_exists(snapname
.c_str())) {
12628 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12631 } else if (p
->is_tier()) {
12632 ss
<< "pool " << poolstr
<< " is a cache tier";
12637 if (pending_inc
.new_pools
.count(pool
))
12638 pp
= &pending_inc
.new_pools
[pool
];
12640 pp
= &pending_inc
.new_pools
[pool
];
12643 if (pp
->snap_exists(snapname
.c_str())) {
12644 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " already exists";
12646 pp
->add_snap(snapname
.c_str(), ceph_clock_now());
12647 pp
->set_snap_epoch(pending_inc
.epoch
);
12648 ss
<< "created pool " << poolstr
<< " snap " << snapname
;
12651 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12652 get_last_committed() + 1));
12654 } else if (prefix
== "osd pool rmsnap") {
12656 cmd_getval(cmdmap
, "pool", poolstr
);
12657 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12659 ss
<< "unrecognized pool '" << poolstr
<< "'";
12664 cmd_getval(cmdmap
, "snap", snapname
);
12665 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
12666 if (p
->is_unmanaged_snaps_mode()) {
12667 ss
<< "pool " << poolstr
<< " is in unmanaged snaps mode";
12670 } else if (!p
->snap_exists(snapname
.c_str())) {
12671 ss
<< "pool " << poolstr
<< " snap " << snapname
<< " does not exist";
12676 if (pending_inc
.new_pools
.count(pool
))
12677 pp
= &pending_inc
.new_pools
[pool
];
12679 pp
= &pending_inc
.new_pools
[pool
];
12682 snapid_t sn
= pp
->snap_exists(snapname
.c_str());
12684 pp
->remove_snap(sn
);
12685 pp
->set_snap_epoch(pending_inc
.epoch
);
12686 ss
<< "removed pool " << poolstr
<< " snap " << snapname
;
12688 ss
<< "already removed pool " << poolstr
<< " snap " << snapname
;
12691 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12692 get_last_committed() + 1));
12694 } else if (prefix
== "osd pool create") {
12695 int64_t pg_num
, pg_num_min
;
12697 cmd_getval(cmdmap
, "pg_num", pg_num
, int64_t(0));
12698 cmd_getval(cmdmap
, "pgp_num", pgp_num
, pg_num
);
12699 cmd_getval(cmdmap
, "pg_num_min", pg_num_min
, int64_t(0));
12701 string pool_type_str
;
12702 cmd_getval(cmdmap
, "pool_type", pool_type_str
);
12703 if (pool_type_str
.empty())
12704 pool_type_str
= g_conf().get_val
<string
>("osd_pool_default_type");
12707 cmd_getval(cmdmap
, "pool", poolstr
);
12708 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12709 if (pool_id
>= 0) {
12710 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
12711 if (pool_type_str
!= p
->get_type_name()) {
12712 ss
<< "pool '" << poolstr
<< "' cannot change to type " << pool_type_str
;
12715 ss
<< "pool '" << poolstr
<< "' already exists";
12722 if (pool_type_str
== "replicated") {
12723 pool_type
= pg_pool_t::TYPE_REPLICATED
;
12724 } else if (pool_type_str
== "erasure") {
12725 pool_type
= pg_pool_t::TYPE_ERASURE
;
12727 ss
<< "unknown pool type '" << pool_type_str
<< "'";
12732 bool implicit_rule_creation
= false;
12733 int64_t expected_num_objects
= 0;
12735 cmd_getval(cmdmap
, "rule", rule_name
);
12736 string erasure_code_profile
;
12737 cmd_getval(cmdmap
, "erasure_code_profile", erasure_code_profile
);
12739 if (pool_type
== pg_pool_t::TYPE_ERASURE
) {
12740 if (erasure_code_profile
== "")
12741 erasure_code_profile
= "default";
12742 //handle the erasure code profile
12743 if (erasure_code_profile
== "default") {
12744 if (!osdmap
.has_erasure_code_profile(erasure_code_profile
)) {
12745 if (pending_inc
.has_erasure_code_profile(erasure_code_profile
)) {
12746 dout(20) << "erasure code profile " << erasure_code_profile
<< " already pending" << dendl
;
12750 map
<string
,string
> profile_map
;
12751 err
= osdmap
.get_erasure_code_profile_default(cct
,
12756 dout(20) << "erasure code profile " << erasure_code_profile
<< " set" << dendl
;
12757 pending_inc
.set_erasure_code_profile(erasure_code_profile
, profile_map
);
12761 if (rule_name
== "") {
12762 implicit_rule_creation
= true;
12763 if (erasure_code_profile
== "default") {
12764 rule_name
= "erasure-code";
12766 dout(1) << "implicitly use rule named after the pool: "
12767 << poolstr
<< dendl
;
12768 rule_name
= poolstr
;
12771 cmd_getval(cmdmap
, "expected_num_objects",
12772 expected_num_objects
, int64_t(0));
12774 //NOTE:for replicated pool,cmd_map will put rule_name to erasure_code_profile field
12775 // and put expected_num_objects to rule field
12776 if (erasure_code_profile
!= "") { // cmd is from CLI
12777 if (rule_name
!= "") {
12779 expected_num_objects
= strict_strtoll(rule_name
.c_str(), 10, &interr
);
12780 if (interr
.length()) {
12781 ss
<< "error parsing integer value '" << rule_name
<< "': " << interr
;
12786 rule_name
= erasure_code_profile
;
12787 } else { // cmd is well-formed
12788 cmd_getval(cmdmap
, "expected_num_objects",
12789 expected_num_objects
, int64_t(0));
12793 if (!implicit_rule_creation
&& rule_name
!= "") {
12795 err
= get_crush_rule(rule_name
, &rule
, &ss
);
12796 if (err
== -EAGAIN
) {
12797 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12804 if (expected_num_objects
< 0) {
12805 ss
<< "'expected_num_objects' must be non-negative";
12811 osdmap
.get_all_osds(osds
);
12812 bool has_filestore_osd
= std::any_of(osds
.begin(), osds
.end(), [this](int osd
) {
12814 if (!get_osd_objectstore_type(osd
, &type
)) {
12815 return type
== "filestore";
12821 if (has_filestore_osd
&&
12822 expected_num_objects
> 0 &&
12823 cct
->_conf
->filestore_merge_threshold
> 0) {
12824 ss
<< "'expected_num_objects' requires 'filestore_merge_threshold < 0'";
12829 if (has_filestore_osd
&&
12830 expected_num_objects
== 0 &&
12831 cct
->_conf
->filestore_merge_threshold
< 0) {
12832 int osds
= osdmap
.get_num_osds();
12834 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
12835 if (!sure
&& osds
&& (pg_num
>= 1024 || pg_num
/ osds
>= 100)) {
12836 ss
<< "For better initial performance on pools expected to store a "
12837 << "large number of objects, consider supplying the "
12838 << "expected_num_objects parameter when creating the pool."
12839 << " Pass --yes-i-really-mean-it to ignore it";
12845 int64_t fast_read_param
;
12846 cmd_getval(cmdmap
, "fast_read", fast_read_param
, int64_t(-1));
12847 FastReadType fast_read
= FAST_READ_DEFAULT
;
12848 if (fast_read_param
== 0)
12849 fast_read
= FAST_READ_OFF
;
12850 else if (fast_read_param
> 0)
12851 fast_read
= FAST_READ_ON
;
12853 int64_t repl_size
= 0;
12854 cmd_getval(cmdmap
, "size", repl_size
);
12855 int64_t target_size_bytes
= 0;
12856 double target_size_ratio
= 0.0;
12857 cmd_getval(cmdmap
, "target_size_bytes", target_size_bytes
);
12858 cmd_getval(cmdmap
, "target_size_ratio", target_size_ratio
);
12860 string pg_autoscale_mode
;
12861 cmd_getval(cmdmap
, "autoscale_mode", pg_autoscale_mode
);
12863 err
= prepare_new_pool(poolstr
,
12864 -1, // default crush rule
12866 pg_num
, pgp_num
, pg_num_min
,
12867 repl_size
, target_size_bytes
, target_size_ratio
,
12868 erasure_code_profile
, pool_type
,
12869 (uint64_t)expected_num_objects
,
12876 ss
<< "pool '" << poolstr
<< "' already exists";
12879 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12888 ss
<< "pool '" << poolstr
<< "' created";
12891 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12892 get_last_committed() + 1));
12895 } else if (prefix
== "osd pool delete" ||
12896 prefix
== "osd pool rm") {
12897 // osd pool delete/rm <poolname> <poolname again> --yes-i-really-really-mean-it
12898 string poolstr
, poolstr2
, sure
;
12899 cmd_getval(cmdmap
, "pool", poolstr
);
12900 cmd_getval(cmdmap
, "pool2", poolstr2
);
12901 int64_t pool
= osdmap
.lookup_pg_pool_name(poolstr
.c_str());
12903 ss
<< "pool '" << poolstr
<< "' does not exist";
12908 bool force_no_fake
= false;
12909 cmd_getval(cmdmap
, "yes_i_really_really_mean_it", force_no_fake
);
12910 bool force
= false;
12911 cmd_getval(cmdmap
, "yes_i_really_really_mean_it_not_faking", force
);
12912 if (poolstr2
!= poolstr
||
12913 (!force
&& !force_no_fake
)) {
12914 ss
<< "WARNING: this will *PERMANENTLY DESTROY* all data stored in pool " << poolstr
12915 << ". If you are *ABSOLUTELY CERTAIN* that is what you want, pass the pool name *twice*, "
12916 << "followed by --yes-i-really-really-mean-it.";
12920 err
= _prepare_remove_pool(pool
, &ss
, force_no_fake
);
12921 if (err
== -EAGAIN
) {
12922 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
12928 } else if (prefix
== "osd pool rename") {
12929 string srcpoolstr
, destpoolstr
;
12930 cmd_getval(cmdmap
, "srcpool", srcpoolstr
);
12931 cmd_getval(cmdmap
, "destpool", destpoolstr
);
12932 int64_t pool_src
= osdmap
.lookup_pg_pool_name(srcpoolstr
.c_str());
12933 int64_t pool_dst
= osdmap
.lookup_pg_pool_name(destpoolstr
.c_str());
12935 if (pool_src
< 0) {
12936 if (pool_dst
>= 0) {
12937 // src pool doesn't exist, dst pool does exist: to ensure idempotency
12938 // of operations, assume this rename succeeded, as it is not changing
12939 // the current state. Make sure we output something understandable
12940 // for whoever is issuing the command, if they are paying attention,
12941 // in case it was not intentional; or to avoid a "wtf?" and a bug
12942 // report in case it was intentional, while expecting a failure.
12943 ss
<< "pool '" << srcpoolstr
<< "' does not exist; pool '"
12944 << destpoolstr
<< "' does -- assuming successful rename";
12947 ss
<< "unrecognized pool '" << srcpoolstr
<< "'";
12951 } else if (pool_dst
>= 0) {
12952 // source pool exists and so does the destination pool
12953 ss
<< "pool '" << destpoolstr
<< "' already exists";
12958 int ret
= _prepare_rename_pool(pool_src
, destpoolstr
);
12960 ss
<< "pool '" << srcpoolstr
<< "' renamed to '" << destpoolstr
<< "'";
12962 ss
<< "failed to rename pool '" << srcpoolstr
<< "' to '" << destpoolstr
<< "': "
12963 << cpp_strerror(ret
);
12966 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, ret
, rs
,
12967 get_last_committed() + 1));
12970 } else if (prefix
== "osd pool set") {
12971 err
= prepare_command_pool_set(cmdmap
, ss
);
12972 if (err
== -EAGAIN
)
12978 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
12979 get_last_committed() + 1));
12981 } else if (prefix
== "osd tier add") {
12982 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
12983 if (err
== -EAGAIN
)
12988 cmd_getval(cmdmap
, "pool", poolstr
);
12989 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
12991 ss
<< "unrecognized pool '" << poolstr
<< "'";
12995 string tierpoolstr
;
12996 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
12997 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
12998 if (tierpool_id
< 0) {
12999 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13003 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13005 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13008 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13012 // make sure new tier is empty
13013 string force_nonempty
;
13014 cmd_getval(cmdmap
, "force_nonempty", force_nonempty
);
13015 const pool_stat_t
*pstats
= mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13016 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0 &&
13017 force_nonempty
!= "--force-nonempty") {
13018 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty; --force-nonempty to force";
13022 if (tp
->is_erasure()) {
13023 ss
<< "tier pool '" << tierpoolstr
13024 << "' is an ec pool, which cannot be a tier";
13028 if ((!tp
->removed_snaps
.empty() || !tp
->snaps
.empty()) &&
13029 ((force_nonempty
!= "--force-nonempty") ||
13030 (!g_conf()->mon_debug_unsafe_allow_tier_with_nonempty_snaps
))) {
13031 ss
<< "tier pool '" << tierpoolstr
<< "' has snapshot state; it cannot be added as a tier without breaking the pool";
13036 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13037 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13038 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13039 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13042 np
->tiers
.insert(tierpool_id
);
13043 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13044 ntp
->tier_of
= pool_id
;
13045 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a tier of '" << poolstr
<< "'";
13046 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13047 get_last_committed() + 1));
13049 } else if (prefix
== "osd tier remove" ||
13050 prefix
== "osd tier rm") {
13052 cmd_getval(cmdmap
, "pool", poolstr
);
13053 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13055 ss
<< "unrecognized pool '" << poolstr
<< "'";
13059 string tierpoolstr
;
13060 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13061 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13062 if (tierpool_id
< 0) {
13063 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13067 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13069 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13072 if (!_check_remove_tier(pool_id
, p
, tp
, &err
, &ss
)) {
13076 if (p
->tiers
.count(tierpool_id
) == 0) {
13077 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13081 if (tp
->tier_of
!= pool_id
) {
13082 ss
<< "tier pool '" << tierpoolstr
<< "' is a tier of '"
13083 << osdmap
.get_pool_name(tp
->tier_of
) << "': "
13084 // be scary about it; this is an inconsistency and bells must go off
13085 << "THIS SHOULD NOT HAVE HAPPENED AT ALL";
13089 if (p
->read_tier
== tierpool_id
) {
13090 ss
<< "tier pool '" << tierpoolstr
<< "' is the overlay for '" << poolstr
<< "'; please remove-overlay first";
13095 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13096 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13097 if (np
->tiers
.count(tierpool_id
) == 0 ||
13098 ntp
->tier_of
!= pool_id
||
13099 np
->read_tier
== tierpool_id
) {
13100 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13103 np
->tiers
.erase(tierpool_id
);
13105 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) not a tier of '" << poolstr
<< "'";
13106 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13107 get_last_committed() + 1));
13109 } else if (prefix
== "osd tier set-overlay") {
13110 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13111 if (err
== -EAGAIN
)
13116 cmd_getval(cmdmap
, "pool", poolstr
);
13117 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13119 ss
<< "unrecognized pool '" << poolstr
<< "'";
13123 string overlaypoolstr
;
13124 cmd_getval(cmdmap
, "overlaypool", overlaypoolstr
);
13125 int64_t overlaypool_id
= osdmap
.lookup_pg_pool_name(overlaypoolstr
);
13126 if (overlaypool_id
< 0) {
13127 ss
<< "unrecognized pool '" << overlaypoolstr
<< "'";
13131 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13133 const pg_pool_t
*overlay_p
= osdmap
.get_pg_pool(overlaypool_id
);
13134 ceph_assert(overlay_p
);
13135 if (p
->tiers
.count(overlaypool_id
) == 0) {
13136 ss
<< "tier pool '" << overlaypoolstr
<< "' is not a tier of '" << poolstr
<< "'";
13140 if (p
->read_tier
== overlaypool_id
) {
13142 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13145 if (p
->has_read_tier()) {
13146 ss
<< "pool '" << poolstr
<< "' has overlay '"
13147 << osdmap
.get_pool_name(p
->read_tier
)
13148 << "'; please remove-overlay first";
13154 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13155 np
->read_tier
= overlaypool_id
;
13156 np
->write_tier
= overlaypool_id
;
13157 np
->set_last_force_op_resend(pending_inc
.epoch
);
13158 pg_pool_t
*noverlay_p
= pending_inc
.get_new_pool(overlaypool_id
, overlay_p
);
13159 noverlay_p
->set_last_force_op_resend(pending_inc
.epoch
);
13160 ss
<< "overlay for '" << poolstr
<< "' is now (or already was) '" << overlaypoolstr
<< "'";
13161 if (overlay_p
->cache_mode
== pg_pool_t::CACHEMODE_NONE
)
13162 ss
<<" (WARNING: overlay pool cache_mode is still NONE)";
13163 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13164 get_last_committed() + 1));
13166 } else if (prefix
== "osd tier remove-overlay" ||
13167 prefix
== "osd tier rm-overlay") {
13169 cmd_getval(cmdmap
, "pool", poolstr
);
13170 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13172 ss
<< "unrecognized pool '" << poolstr
<< "'";
13176 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13178 if (!p
->has_read_tier()) {
13180 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13184 if (!_check_remove_tier(pool_id
, p
, NULL
, &err
, &ss
)) {
13189 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13190 if (np
->has_read_tier()) {
13191 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->read_tier
);
13192 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->read_tier
,op
);
13193 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13195 if (np
->has_write_tier()) {
13196 const pg_pool_t
*op
= osdmap
.get_pg_pool(np
->write_tier
);
13197 pg_pool_t
*nop
= pending_inc
.get_new_pool(np
->write_tier
, op
);
13198 nop
->set_last_force_op_resend(pending_inc
.epoch
);
13200 np
->clear_read_tier();
13201 np
->clear_write_tier();
13202 np
->set_last_force_op_resend(pending_inc
.epoch
);
13203 ss
<< "there is now (or already was) no overlay for '" << poolstr
<< "'";
13204 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13205 get_last_committed() + 1));
13207 } else if (prefix
== "osd tier cache-mode") {
13208 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13209 if (err
== -EAGAIN
)
13214 cmd_getval(cmdmap
, "pool", poolstr
);
13215 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13217 ss
<< "unrecognized pool '" << poolstr
<< "'";
13221 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13223 if (!p
->is_tier()) {
13224 ss
<< "pool '" << poolstr
<< "' is not a tier";
13229 cmd_getval(cmdmap
, "mode", modestr
);
13230 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13231 if (int(mode
) < 0) {
13232 ss
<< "'" << modestr
<< "' is not a valid cache mode";
13238 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13240 if (mode
== pg_pool_t::CACHEMODE_FORWARD
||
13241 mode
== pg_pool_t::CACHEMODE_READFORWARD
) {
13242 ss
<< "'" << modestr
<< "' is no longer a supported cache mode";
13246 if ((mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13247 mode
!= pg_pool_t::CACHEMODE_NONE
&&
13248 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13249 mode
!= pg_pool_t::CACHEMODE_READPROXY
) &&
13251 ss
<< "'" << modestr
<< "' is not a well-supported cache mode and may "
13252 << "corrupt your data. pass --yes-i-really-mean-it to force.";
13257 // pool already has this cache-mode set and there are no pending changes
13258 if (p
->cache_mode
== mode
&&
13259 (pending_inc
.new_pools
.count(pool_id
) == 0 ||
13260 pending_inc
.new_pools
[pool_id
].cache_mode
== p
->cache_mode
)) {
13261 ss
<< "set cache-mode for pool '" << poolstr
<< "'"
13262 << " to " << pg_pool_t::get_cache_mode_name(mode
);
13267 /* Mode description:
13269 * none: No cache-mode defined
13270 * forward: Forward all reads and writes to base pool [removed]
13271 * writeback: Cache writes, promote reads from base pool
13272 * readonly: Forward writes to base pool
13273 * readforward: Writes are in writeback mode, Reads are in forward mode [removed]
13274 * proxy: Proxy all reads and writes to base pool
13275 * readproxy: Writes are in writeback mode, Reads are in proxy mode
13277 * Hence, these are the allowed transitions:
13280 * forward -> proxy || readforward || readproxy || writeback || any IF num_objects_dirty == 0
13281 * proxy -> readproxy || writeback || any IF num_objects_dirty == 0
13282 * readforward -> forward || proxy || readproxy || writeback || any IF num_objects_dirty == 0
13283 * readproxy -> proxy || writeback || any IF num_objects_dirty == 0
13284 * writeback -> readproxy || proxy
13288 // We check if the transition is valid against the current pool mode, as
13289 // it is the only committed state thus far. We will blantly squash
13290 // whatever mode is on the pending state.
13292 if (p
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
13293 (mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13294 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) {
13295 ss
<< "unable to set cache-mode '" << pg_pool_t::get_cache_mode_name(mode
)
13296 << "' on a '" << pg_pool_t::get_cache_mode_name(p
->cache_mode
)
13297 << "' pool; only '"
13298 << pg_pool_t::get_cache_mode_name(pg_pool_t::CACHEMODE_READPROXY
)
13303 if ((p
->cache_mode
== pg_pool_t::CACHEMODE_READFORWARD
&&
13304 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13305 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13306 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13308 (p
->cache_mode
== pg_pool_t::CACHEMODE_READPROXY
&&
13309 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13310 mode
!= pg_pool_t::CACHEMODE_PROXY
)) ||
13312 (p
->cache_mode
== pg_pool_t::CACHEMODE_PROXY
&&
13313 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13314 mode
!= pg_pool_t::CACHEMODE_READPROXY
)) ||
13316 (p
->cache_mode
== pg_pool_t::CACHEMODE_FORWARD
&&
13317 (mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13318 mode
!= pg_pool_t::CACHEMODE_PROXY
&&
13319 mode
!= pg_pool_t::CACHEMODE_READPROXY
))) {
13321 const pool_stat_t
* pstats
=
13322 mon
.mgrstatmon()->get_pool_stat(pool_id
);
13324 if (pstats
&& pstats
->stats
.sum
.num_objects_dirty
> 0) {
13325 ss
<< "unable to set cache-mode '"
13326 << pg_pool_t::get_cache_mode_name(mode
) << "' on pool '" << poolstr
13327 << "': dirty objects found";
13333 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13334 np
->cache_mode
= mode
;
13335 // set this both when moving to and from cache_mode NONE. this is to
13336 // capture legacy pools that were set up before this flag existed.
13337 np
->flags
|= pg_pool_t::FLAG_INCOMPLETE_CLONES
;
13338 ss
<< "set cache-mode for pool '" << poolstr
13339 << "' to " << pg_pool_t::get_cache_mode_name(mode
);
13340 if (mode
== pg_pool_t::CACHEMODE_NONE
) {
13341 const pg_pool_t
*base_pool
= osdmap
.get_pg_pool(np
->tier_of
);
13342 ceph_assert(base_pool
);
13343 if (base_pool
->read_tier
== pool_id
||
13344 base_pool
->write_tier
== pool_id
)
13345 ss
<<" (WARNING: pool is still configured as read or write tier)";
13347 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13348 get_last_committed() + 1));
13350 } else if (prefix
== "osd tier add-cache") {
13351 err
= check_cluster_features(CEPH_FEATURE_OSD_CACHEPOOL
, ss
);
13352 if (err
== -EAGAIN
)
13357 cmd_getval(cmdmap
, "pool", poolstr
);
13358 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13360 ss
<< "unrecognized pool '" << poolstr
<< "'";
13364 string tierpoolstr
;
13365 cmd_getval(cmdmap
, "tierpool", tierpoolstr
);
13366 int64_t tierpool_id
= osdmap
.lookup_pg_pool_name(tierpoolstr
);
13367 if (tierpool_id
< 0) {
13368 ss
<< "unrecognized pool '" << tierpoolstr
<< "'";
13372 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool_id
);
13374 const pg_pool_t
*tp
= osdmap
.get_pg_pool(tierpool_id
);
13377 if (!_check_become_tier(tierpool_id
, tp
, pool_id
, p
, &err
, &ss
)) {
13382 if (!cmd_getval(cmdmap
, "size", size
)) {
13383 ss
<< "unable to parse 'size' value '"
13384 << cmd_vartype_stringify(cmdmap
.at("size")) << "'";
13388 // make sure new tier is empty
13389 const pool_stat_t
*pstats
=
13390 mon
.mgrstatmon()->get_pool_stat(tierpool_id
);
13391 if (pstats
&& pstats
->stats
.sum
.num_objects
!= 0) {
13392 ss
<< "tier pool '" << tierpoolstr
<< "' is not empty";
13396 auto& modestr
= g_conf().get_val
<string
>("osd_tier_default_cache_mode");
13397 pg_pool_t::cache_mode_t mode
= pg_pool_t::get_cache_mode_from_str(modestr
);
13398 if (int(mode
) < 0) {
13399 ss
<< "osd tier cache default mode '" << modestr
<< "' is not a valid cache mode";
13403 HitSet::Params hsp
;
13404 auto& cache_hit_set_type
=
13405 g_conf().get_val
<string
>("osd_tier_default_cache_hit_set_type");
13406 if (cache_hit_set_type
== "bloom") {
13407 BloomHitSet::Params
*bsp
= new BloomHitSet::Params
;
13408 bsp
->set_fpp(g_conf().get_val
<double>("osd_pool_default_hit_set_bloom_fpp"));
13409 hsp
= HitSet::Params(bsp
);
13410 } else if (cache_hit_set_type
== "explicit_hash") {
13411 hsp
= HitSet::Params(new ExplicitHashHitSet::Params
);
13412 } else if (cache_hit_set_type
== "explicit_object") {
13413 hsp
= HitSet::Params(new ExplicitObjectHitSet::Params
);
13415 ss
<< "osd tier cache default hit set type '"
13416 << cache_hit_set_type
<< "' is not a known type";
13421 pg_pool_t
*np
= pending_inc
.get_new_pool(pool_id
, p
);
13422 pg_pool_t
*ntp
= pending_inc
.get_new_pool(tierpool_id
, tp
);
13423 if (np
->tiers
.count(tierpool_id
) || ntp
->is_tier()) {
13424 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
13427 np
->tiers
.insert(tierpool_id
);
13428 np
->read_tier
= np
->write_tier
= tierpool_id
;
13429 np
->set_snap_epoch(pending_inc
.epoch
); // tier will update to our snap info
13430 np
->set_last_force_op_resend(pending_inc
.epoch
);
13431 ntp
->set_last_force_op_resend(pending_inc
.epoch
);
13432 ntp
->tier_of
= pool_id
;
13433 ntp
->cache_mode
= mode
;
13434 ntp
->hit_set_count
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_count");
13435 ntp
->hit_set_period
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_period");
13436 ntp
->min_read_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_read_recency_for_promote");
13437 ntp
->min_write_recency_for_promote
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_min_write_recency_for_promote");
13438 ntp
->hit_set_grade_decay_rate
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_grade_decay_rate");
13439 ntp
->hit_set_search_last_n
= g_conf().get_val
<uint64_t>("osd_tier_default_cache_hit_set_search_last_n");
13440 ntp
->hit_set_params
= hsp
;
13441 ntp
->target_max_bytes
= size
;
13442 ss
<< "pool '" << tierpoolstr
<< "' is now (or already was) a cache tier of '" << poolstr
<< "'";
13443 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, ss
.str(),
13444 get_last_committed() + 1));
13446 } else if (prefix
== "osd pool set-quota") {
13448 cmd_getval(cmdmap
, "pool", poolstr
);
13449 int64_t pool_id
= osdmap
.lookup_pg_pool_name(poolstr
);
13451 ss
<< "unrecognized pool '" << poolstr
<< "'";
13457 cmd_getval(cmdmap
, "field", field
);
13458 if (field
!= "max_objects" && field
!= "max_bytes") {
13459 ss
<< "unrecognized field '" << field
<< "'; should be 'max_bytes' or 'max_objects'";
13464 // val could contain unit designations, so we treat as a string
13466 cmd_getval(cmdmap
, "val", val
);
13469 if (field
== "max_objects") {
13470 value
= strict_sistrtoll(val
.c_str(), &tss
);
13471 } else if (field
== "max_bytes") {
13472 value
= strict_iecstrtoll(val
.c_str(), &tss
);
13474 ceph_abort_msg("unrecognized option");
13476 if (!tss
.empty()) {
13477 ss
<< "error parsing value '" << val
<< "': " << tss
;
13482 pg_pool_t
*pi
= pending_inc
.get_new_pool(pool_id
, osdmap
.get_pg_pool(pool_id
));
13483 if (field
== "max_objects") {
13484 pi
->quota_max_objects
= value
;
13485 } else if (field
== "max_bytes") {
13486 pi
->quota_max_bytes
= value
;
13488 ceph_abort_msg("unrecognized option");
13490 ss
<< "set-quota " << field
<< " = " << value
<< " for pool " << poolstr
;
13492 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13493 get_last_committed() + 1));
13495 } else if (prefix
== "osd pool application enable" ||
13496 prefix
== "osd pool application disable" ||
13497 prefix
== "osd pool application set" ||
13498 prefix
== "osd pool application rm") {
13499 err
= prepare_command_pool_application(prefix
, cmdmap
, ss
);
13500 if (err
== -EAGAIN
) {
13502 } else if (err
< 0) {
13507 } else if (prefix
== "osd force-create-pg") {
13510 cmd_getval(cmdmap
, "pgid", pgidstr
);
13511 if (!pgid
.parse(pgidstr
.c_str())) {
13512 ss
<< "invalid pgid '" << pgidstr
<< "'";
13516 if (!osdmap
.pg_exists(pgid
)) {
13517 ss
<< "pg " << pgid
<< " should not exist";
13522 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13524 ss
<< "This command will recreate a lost (as in data lost) PG with data in it, such "
13525 << "that the cluster will give up ever trying to recover the lost data. Do this "
13526 << "only if you are certain that all copies of the PG are in fact lost and you are "
13527 << "willing to accept that the data is permanently destroyed. Pass "
13528 << "--yes-i-really-mean-it to proceed.";
13534 std::lock_guard
<std::mutex
> l(creating_pgs_lock
);
13535 auto emplaced
= creating_pgs
.pgs
.emplace(
13537 creating_pgs_t::pg_create_info(osdmap
.get_epoch(),
13538 ceph_clock_now()));
13539 creating_now
= emplaced
.second
;
13541 if (creating_now
) {
13542 ss
<< "pg " << pgidstr
<< " now creating, ok";
13543 // set the pool's CREATING flag so that (1) the osd won't ignore our
13544 // create message and (2) we won't propose any future pg_num changes
13545 // until after the PG has been instantiated.
13546 if (pending_inc
.new_pools
.count(pgid
.pool()) == 0) {
13547 pending_inc
.new_pools
[pgid
.pool()] = *osdmap
.get_pg_pool(pgid
.pool());
13549 pending_inc
.new_pools
[pgid
.pool()].flags
|= pg_pool_t::FLAG_CREATING
;
13553 ss
<< "pg " << pgid
<< " already creating";
13557 } else if (prefix
== "osd force_healthy_stretch_mode") {
13559 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13561 ss
<< "This command will require peering across multiple CRUSH buckets "
13562 "(probably two data centers or availability zones?) and may result in PGs "
13563 "going inactive until backfilling is complete. Pass --yes-i-really-mean-it to proceed.";
13567 try_end_recovery_stretch_mode(true);
13568 ss
<< "Triggering healthy stretch mode";
13571 } else if (prefix
== "osd force_recovery_stretch_mode") {
13573 cmd_getval(cmdmap
, "yes_i_really_mean_it", sure
);
13575 ss
<< "This command will increase pool sizes to try and spread them "
13576 "across multiple CRUSH buckets (probably two data centers or "
13577 "availability zones?) and should have happened automatically"
13578 "Pass --yes-i-really-mean-it to proceed.";
13582 mon
.go_recovery_stretch_mode();
13583 ss
<< "Triggering recovery stretch mode";
13592 if (err
< 0 && rs
.length() == 0)
13593 rs
= cpp_strerror(err
);
13594 mon
.reply_command(op
, err
, rs
, rdata
, get_last_committed());
13599 wait_for_finished_proposal(op
, new Monitor::C_Command(mon
, op
, 0, rs
,
13600 get_last_committed() + 1));
13604 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
// Capability gate for incoming MPoolOp messages: returns whether the op was
// rejected (replied to) here. Unmanaged-snap create/delete ops get the
// stricter is_unmanaged_snap_op_permitted() check; all other pool ops only
// require the 'osd' MON_CAP_W capability on the session.
// NOTE(review): this extract is missing intermediate lines (braces, returns,
// the surrounding switch header) — comments below describe only what is visible.
13608 bool OSDMonitor::enforce_pool_op_caps(MonOpRequestRef op
)
// Record this op against the osdmon for event tracing.
13610 op
->mark_osdmon_event(__func__
);
// The request payload is an MPoolOp; the session identifies the caller's caps.
13612 auto m
= op
->get_req
<MPoolOp
>();
13613 MonSession
*session
= op
->get_session();
// Reject with -EPERM at the current osdmap epoch.
// NOTE(review): the guarding condition for this reply (presumably a null
// session check) is not present in this extract — confirm against the full file.
13615 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Unmanaged snapshot ops: look up the target pool's name (if the pool still
// exists) so the permission check can be made per-pool.
13620 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13621 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13623 const std::string
* pool_name
= nullptr;
13624 const pg_pool_t
*pg_pool
= osdmap
.get_pg_pool(m
->pool
);
13625 if (pg_pool
!= nullptr) {
13626 pool_name
= &osdmap
.get_pool_name(m
->pool
);
// Consult the key server / entity caps / peer address to decide whether this
// entity may perform unmanaged-snap operations.
13629 if (!is_unmanaged_snap_op_permitted(cct
, mon
.key_server
,
13630 session
->entity_name
, session
->caps
,
13631 session
->get_peer_socket_addr(),
// Log the refused message and its caps at level 0, then reply -EPERM.
13633 dout(0) << "got unmanaged-snap pool op from entity with insufficient "
13634 << "privileges. message: " << *m
<< std::endl
13635 << "caps: " << session
->caps
<< dendl
;
13636 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// All other pool ops: plain write capability on the "osd" service is required.
13642 if (!session
->is_capable("osd", MON_CAP_W
)) {
13643 dout(0) << "got pool op from entity with insufficient privileges. "
13644 << "message: " << *m
<< std::endl
13645 << "caps: " << session
->caps
<< dendl
;
13646 _pool_op_reply(op
, -EPERM
, osdmap
.get_epoch());
// Read-only fast path for MPoolOp: answer no-op / invalid requests directly
// from the committed osdmap without proposing a map change. Returns true when
// the op was fully handled here (reply sent); otherwise the op falls through
// to prepare_pool_op() for a paxos proposal.
// NOTE(review): this extract drops many original lines (switch header,
// returns, braces) — comments describe only the visible statements.
13655 bool OSDMonitor::preprocess_pool_op(MonOpRequestRef op
)
13657 op
->mark_osdmon_event(__func__
);
13658 auto m
= op
->get_req
<MPoolOp
>();
// Caps check first; enforce_pool_op_caps() replies itself on rejection.
13660 if (enforce_pool_op_caps(op
)) {
// Drop messages from a different cluster: fsid must match our monmap's.
13664 if (m
->fsid
!= mon
.monmap
->fsid
) {
13665 dout(0) << __func__
<< " drop message on fsid " << m
->fsid
13666 << " != " << mon
.monmap
->fsid
<< " for " << *m
<< dendl
;
13667 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Pool creation has its own idempotency preprocessing.
13671 if (m
->op
== POOL_OP_CREATE
)
13672 return preprocess_pool_op_create(op
);
// Every remaining op targets an existing pool; resolve it now.
13674 const pg_pool_t
*p
= osdmap
.get_pg_pool(m
->pool
);
13675 if (p
== nullptr) {
13676 dout(10) << "attempt to operate on non-existent pool id " << m
->pool
<< dendl
;
// Deleting a pool that is already gone succeeds (idempotent, reply 0);
// anything else on a missing pool is -ENOENT.
13677 if (m
->op
== POOL_OP_DELETE
) {
13678 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13680 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13685 // check if the snap and snapname exist
13686 bool snap_exists
= false;
13687 if (p
->snap_exists(m
->name
.c_str()))
13688 snap_exists
= true;
// Pool (managed) snap creation is invalid on unmanaged-snaps pools and on
// cache tiers; creating an already-existing snap is an idempotent success.
13691 case POOL_OP_CREATE_SNAP
:
13692 if (p
->is_unmanaged_snaps_mode() || p
->is_tier()) {
13693 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13697 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snap creation conflicts with pool-snaps mode.
13701 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13702 if (p
->is_pool_snaps_mode()) {
13703 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
// Managed snap deletion: invalid on unmanaged-snaps pools; deleting a snap
// that does not exist is an idempotent success (reply 0).
13707 case POOL_OP_DELETE_SNAP
:
13708 if (p
->is_unmanaged_snaps_mode()) {
13709 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13712 if (!snap_exists
) {
13713 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Unmanaged snap deletion: invalid in pool-snaps mode; a snapid already
// removed/purged in the committed map is an idempotent success.
13717 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13718 if (p
->is_pool_snaps_mode()) {
13719 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13722 if (_is_removed_snap(m
->pool
, m
->snapid
)) {
13723 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Pool delete by name: if a pool with this name still resolves, reply 0 here.
// NOTE(review): the condition reads ">= 0" (pool exists) — the actual
// success/forward semantics depend on lines missing from this extract; verify.
13727 case POOL_OP_DELETE
:
13728 if (osdmap
.lookup_pg_pool_name(m
->name
.c_str()) >= 0) {
13729 _pool_op_reply(op
, 0, osdmap
.get_epoch());
// Legacy auid-change op (body not visible in this extract).
13733 case POOL_OP_AUID_CHANGE
:
// Whether (pool, snap) is already removed in the *committed* osdmap state:
// the pool no longer exists, the snap sits in the removed_snaps_queue, or it
// has been recorded as purged. Each branch logs its reason at dout(10).
// NOTE(review): the return statements are missing from this extract; the
// boolean returned by each branch must be confirmed against the full file.
13743 bool OSDMonitor::_is_removed_snap(int64_t pool
, snapid_t snap
)
// Pool gone entirely -> the snap is trivially "removed".
13745 if (!osdmap
.have_pg_pool(pool
)) {
13746 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13747 << " - pool dne" << dendl
;
// Snap queued for removal in the committed map.
13750 if (osdmap
.in_removed_snaps_queue(pool
, snap
)) {
13751 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13752 << " - in osdmap removed_snaps_queue" << dendl
;
// Finally, check the purged-snaps records; on a hit, [begin, end) is the
// purged interval containing the snap.
13755 snapid_t begin
, end
;
13756 int r
= lookup_purged_snap(pool
, snap
, &begin
, &end
);
13758 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13759 << " - purged, [" << begin
<< "," << end
<< ")" << dendl
;
13765 bool OSDMonitor::_is_pending_removed_snap(int64_t pool
, snapid_t snap
)
13767 if (pending_inc
.old_pools
.count(pool
)) {
13768 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13769 << " - pool pending deletion" << dendl
;
13772 if (pending_inc
.in_new_removed_snaps(pool
, snap
)) {
13773 dout(10) << __func__
<< " pool " << pool
<< " snap " << snap
13774 << " - in pending new_removed_snaps" << dendl
;
13780 bool OSDMonitor::preprocess_pool_op_create(MonOpRequestRef op
)
13782 op
->mark_osdmon_event(__func__
);
13783 auto m
= op
->get_req
<MPoolOp
>();
13784 int64_t pool
= osdmap
.lookup_pg_pool_name(m
->name
.c_str());
13786 _pool_op_reply(op
, 0, osdmap
.get_epoch());
13793 bool OSDMonitor::prepare_pool_op(MonOpRequestRef op
)
13795 op
->mark_osdmon_event(__func__
);
13796 auto m
= op
->get_req
<MPoolOp
>();
13797 dout(10) << "prepare_pool_op " << *m
<< dendl
;
13798 if (m
->op
== POOL_OP_CREATE
) {
13799 return prepare_pool_op_create(op
);
13800 } else if (m
->op
== POOL_OP_DELETE
) {
13801 return prepare_pool_op_delete(op
);
13805 bool changed
= false;
13807 if (!osdmap
.have_pg_pool(m
->pool
)) {
13808 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13812 const pg_pool_t
*pool
= osdmap
.get_pg_pool(m
->pool
);
13815 case POOL_OP_CREATE_SNAP
:
13816 if (pool
->is_tier()) {
13818 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13820 } // else, fall through
13821 case POOL_OP_DELETE_SNAP
:
13822 if (!pool
->is_unmanaged_snaps_mode()) {
13823 bool snap_exists
= pool
->snap_exists(m
->name
.c_str());
13824 if ((m
->op
== POOL_OP_CREATE_SNAP
&& snap_exists
)
13825 || (m
->op
== POOL_OP_DELETE_SNAP
&& !snap_exists
)) {
13833 _pool_op_reply(op
, ret
, osdmap
.get_epoch());
13836 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13837 // we won't allow removal of an unmanaged snapshot from a pool
13838 // not in unmanaged snaps mode.
13839 if (!pool
->is_unmanaged_snaps_mode()) {
13840 _pool_op_reply(op
, -ENOTSUP
, osdmap
.get_epoch());
13844 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13845 // but we will allow creating an unmanaged snapshot on any pool
13846 // as long as it is not in 'pool' snaps mode.
13847 if (pool
->is_pool_snaps_mode()) {
13848 _pool_op_reply(op
, -EINVAL
, osdmap
.get_epoch());
13853 // projected pool info
13855 if (pending_inc
.new_pools
.count(m
->pool
))
13856 pp
= pending_inc
.new_pools
[m
->pool
];
13858 pp
= *osdmap
.get_pg_pool(m
->pool
);
13860 bufferlist reply_data
;
13862 // pool snaps vs unmanaged snaps are mutually exclusive
13864 case POOL_OP_CREATE_SNAP
:
13865 case POOL_OP_DELETE_SNAP
:
13866 if (pp
.is_unmanaged_snaps_mode()) {
13872 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13873 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13874 if (pp
.is_pool_snaps_mode()) {
13881 case POOL_OP_CREATE_SNAP
:
13882 if (!pp
.snap_exists(m
->name
.c_str())) {
13883 pp
.add_snap(m
->name
.c_str(), ceph_clock_now());
13884 dout(10) << "create snap in pool " << m
->pool
<< " " << m
->name
13885 << " seq " << pp
.get_snap_epoch() << dendl
;
13890 case POOL_OP_DELETE_SNAP
:
13892 snapid_t s
= pp
.snap_exists(m
->name
.c_str());
13895 pending_inc
.new_removed_snaps
[m
->pool
].insert(s
);
13901 case POOL_OP_CREATE_UNMANAGED_SNAP
:
13903 uint64_t snapid
= pp
.add_unmanaged_snap(
13904 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13905 encode(snapid
, reply_data
);
13910 case POOL_OP_DELETE_UNMANAGED_SNAP
:
13911 if (!_is_removed_snap(m
->pool
, m
->snapid
) &&
13912 !_is_pending_removed_snap(m
->pool
, m
->snapid
)) {
13913 if (m
->snapid
> pp
.get_snap_seq()) {
13914 _pool_op_reply(op
, -ENOENT
, osdmap
.get_epoch());
13917 pp
.remove_unmanaged_snap(
13919 osdmap
.require_osd_release
< ceph_release_t::octopus
);
13920 pending_inc
.new_removed_snaps
[m
->pool
].insert(m
->snapid
);
13921 // also record the new seq as purged: this avoids a discontinuity
13922 // after all of the snaps have been purged, since the seq assigned
13923 // during removal lives in the same namespace as the actual snaps.
13924 pending_pseudo_purged_snaps
[m
->pool
].insert(pp
.get_snap_seq());
13929 case POOL_OP_AUID_CHANGE
:
13930 _pool_op_reply(op
, -EOPNOTSUPP
, osdmap
.get_epoch());
13939 pp
.set_snap_epoch(pending_inc
.epoch
);
13940 pending_inc
.new_pools
[m
->pool
] = pp
;
13944 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
, pending_inc
.epoch
, &reply_data
));
13948 bool OSDMonitor::prepare_pool_op_create(MonOpRequestRef op
)
13950 op
->mark_osdmon_event(__func__
);
13951 int err
= prepare_new_pool(op
);
13952 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, err
, pending_inc
.epoch
));
13956 int OSDMonitor::_check_remove_pool(int64_t pool_id
, const pg_pool_t
& pool
,
13959 const string
& poolstr
= osdmap
.get_pool_name(pool_id
);
13961 // If the Pool is in use by CephFS, refuse to delete it
13962 FSMap
const &pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
13963 if (pending_fsmap
.pool_in_use(pool_id
)) {
13964 *ss
<< "pool '" << poolstr
<< "' is in use by CephFS";
13968 if (pool
.tier_of
>= 0) {
13969 *ss
<< "pool '" << poolstr
<< "' is a tier of '"
13970 << osdmap
.get_pool_name(pool
.tier_of
) << "'";
13973 if (!pool
.tiers
.empty()) {
13974 *ss
<< "pool '" << poolstr
<< "' has tiers";
13975 for(auto tier
: pool
.tiers
) {
13976 *ss
<< " " << osdmap
.get_pool_name(tier
);
13981 if (!g_conf()->mon_allow_pool_delete
) {
13982 *ss
<< "pool deletion is disabled; you must first set the mon_allow_pool_delete config option to true before you can destroy a pool";
13986 if (pool
.has_flag(pg_pool_t::FLAG_NODELETE
)) {
13987 *ss
<< "pool deletion is disabled; you must unset nodelete flag for the pool first";
13991 *ss
<< "pool '" << poolstr
<< "' removed";
13996 * Check if it is safe to add a tier to a base pool
13999 * True if the operation should proceed, false if we should abort here
14000 * (abort doesn't necessarily mean error, could be idempotency)
14002 bool OSDMonitor::_check_become_tier(
14003 const int64_t tier_pool_id
, const pg_pool_t
*tier_pool
,
14004 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14008 const std::string
&tier_pool_name
= osdmap
.get_pool_name(tier_pool_id
);
14009 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14011 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14012 if (pending_fsmap
.pool_in_use(tier_pool_id
)) {
14013 *ss
<< "pool '" << tier_pool_name
<< "' is in use by CephFS";
14018 if (base_pool
->tiers
.count(tier_pool_id
)) {
14019 ceph_assert(tier_pool
->tier_of
== base_pool_id
);
14021 *ss
<< "pool '" << tier_pool_name
<< "' is now (or already was) a tier of '"
14022 << base_pool_name
<< "'";
14026 if (base_pool
->is_tier()) {
14027 *ss
<< "pool '" << base_pool_name
<< "' is already a tier of '"
14028 << osdmap
.get_pool_name(base_pool
->tier_of
) << "', "
14029 << "multiple tiers are not yet supported.";
14034 if (tier_pool
->has_tiers()) {
14035 *ss
<< "pool '" << tier_pool_name
<< "' has following tier(s) already:";
14036 for (set
<uint64_t>::iterator it
= tier_pool
->tiers
.begin();
14037 it
!= tier_pool
->tiers
.end(); ++it
)
14038 *ss
<< "'" << osdmap
.get_pool_name(*it
) << "',";
14039 *ss
<< " multiple tiers are not yet supported.";
14044 if (tier_pool
->is_tier()) {
14045 *ss
<< "tier pool '" << tier_pool_name
<< "' is already a tier of '"
14046 << osdmap
.get_pool_name(tier_pool
->tier_of
) << "'";
14057 * Check if it is safe to remove a tier from this base pool
14060 * True if the operation should proceed, false if we should abort here
14061 * (abort doesn't necessarily mean error, could be idempotency)
14063 bool OSDMonitor::_check_remove_tier(
14064 const int64_t base_pool_id
, const pg_pool_t
*base_pool
,
14065 const pg_pool_t
*tier_pool
,
14066 int *err
, ostream
*ss
) const
14068 const std::string
&base_pool_name
= osdmap
.get_pool_name(base_pool_id
);
14070 // Apply CephFS-specific checks
14071 const FSMap
&pending_fsmap
= mon
.mdsmon()->get_pending_fsmap();
14072 if (pending_fsmap
.pool_in_use(base_pool_id
)) {
14073 if (base_pool
->is_erasure() && !base_pool
->allows_ecoverwrites()) {
14074 // If the underlying pool is erasure coded and does not allow EC
14075 // overwrites, we can't permit the removal of the replicated tier that
14076 // CephFS relies on to access it
14077 *ss
<< "pool '" << base_pool_name
<<
14078 "' does not allow EC overwrites and is in use by CephFS"
14084 if (tier_pool
&& tier_pool
->cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
) {
14085 *ss
<< "pool '" << base_pool_name
<< "' is in use by CephFS, and this "
14086 "tier is still in use as a writeback cache. Change the cache "
14087 "mode and flush the cache before removing it";
14097 int OSDMonitor::_prepare_remove_pool(
14098 int64_t pool
, ostream
*ss
, bool no_fake
)
14100 dout(10) << __func__
<< " " << pool
<< dendl
;
14101 const pg_pool_t
*p
= osdmap
.get_pg_pool(pool
);
14102 int r
= _check_remove_pool(pool
, *p
, ss
);
14106 auto new_pool
= pending_inc
.new_pools
.find(pool
);
14107 if (new_pool
!= pending_inc
.new_pools
.end()) {
14108 // if there is a problem with the pending info, wait and retry
14110 const auto& p
= new_pool
->second
;
14111 int r
= _check_remove_pool(pool
, p
, ss
);
14116 if (pending_inc
.old_pools
.count(pool
)) {
14117 dout(10) << __func__
<< " " << pool
<< " already pending removal"
14122 if (g_conf()->mon_fake_pool_delete
&& !no_fake
) {
14123 string old_name
= osdmap
.get_pool_name(pool
);
14124 string new_name
= old_name
+ "." + stringify(pool
) + ".DELETED";
14125 dout(1) << __func__
<< " faking pool deletion: renaming " << pool
<< " "
14126 << old_name
<< " -> " << new_name
<< dendl
;
14127 pending_inc
.new_pool_names
[pool
] = new_name
;
14132 pending_inc
.old_pools
.insert(pool
);
14134 // remove any pg_temp mappings for this pool
14135 for (auto p
= osdmap
.pg_temp
->begin();
14136 p
!= osdmap
.pg_temp
->end();
14138 if (p
->first
.pool() == pool
) {
14139 dout(10) << __func__
<< " " << pool
<< " removing obsolete pg_temp "
14140 << p
->first
<< dendl
;
14141 pending_inc
.new_pg_temp
[p
->first
].clear();
14144 // remove any primary_temp mappings for this pool
14145 for (auto p
= osdmap
.primary_temp
->begin();
14146 p
!= osdmap
.primary_temp
->end();
14148 if (p
->first
.pool() == pool
) {
14149 dout(10) << __func__
<< " " << pool
14150 << " removing obsolete primary_temp" << p
->first
<< dendl
;
14151 pending_inc
.new_primary_temp
[p
->first
] = -1;
14154 // remove any pg_upmap mappings for this pool
14155 for (auto& p
: osdmap
.pg_upmap
) {
14156 if (p
.first
.pool() == pool
) {
14157 dout(10) << __func__
<< " " << pool
14158 << " removing obsolete pg_upmap "
14159 << p
.first
<< dendl
;
14160 pending_inc
.old_pg_upmap
.insert(p
.first
);
14163 // remove any pending pg_upmap mappings for this pool
14165 auto it
= pending_inc
.new_pg_upmap
.begin();
14166 while (it
!= pending_inc
.new_pg_upmap
.end()) {
14167 if (it
->first
.pool() == pool
) {
14168 dout(10) << __func__
<< " " << pool
14169 << " removing pending pg_upmap "
14170 << it
->first
<< dendl
;
14171 it
= pending_inc
.new_pg_upmap
.erase(it
);
14177 // remove any pg_upmap_items mappings for this pool
14178 for (auto& p
: osdmap
.pg_upmap_items
) {
14179 if (p
.first
.pool() == pool
) {
14180 dout(10) << __func__
<< " " << pool
14181 << " removing obsolete pg_upmap_items " << p
.first
14183 pending_inc
.old_pg_upmap_items
.insert(p
.first
);
14186 // remove any pending pg_upmap mappings for this pool
14188 auto it
= pending_inc
.new_pg_upmap_items
.begin();
14189 while (it
!= pending_inc
.new_pg_upmap_items
.end()) {
14190 if (it
->first
.pool() == pool
) {
14191 dout(10) << __func__
<< " " << pool
14192 << " removing pending pg_upmap_items "
14193 << it
->first
<< dendl
;
14194 it
= pending_inc
.new_pg_upmap_items
.erase(it
);
14201 // remove any choose_args for this pool
14202 CrushWrapper newcrush
;
14203 _get_pending_crush(newcrush
);
14204 if (newcrush
.have_choose_args(pool
)) {
14205 dout(10) << __func__
<< " removing choose_args for pool " << pool
<< dendl
;
14206 newcrush
.rm_choose_args(pool
);
14207 pending_inc
.crush
.clear();
14208 newcrush
.encode(pending_inc
.crush
, mon
.get_quorum_con_features());
14213 int OSDMonitor::_prepare_rename_pool(int64_t pool
, string newname
)
14215 dout(10) << "_prepare_rename_pool " << pool
<< dendl
;
14216 if (pending_inc
.old_pools
.count(pool
)) {
14217 dout(10) << "_prepare_rename_pool " << pool
<< " pending removal" << dendl
;
14220 for (map
<int64_t,string
>::iterator p
= pending_inc
.new_pool_names
.begin();
14221 p
!= pending_inc
.new_pool_names
.end();
14223 if (p
->second
== newname
&& p
->first
!= pool
) {
14228 pending_inc
.new_pool_names
[pool
] = newname
;
14232 bool OSDMonitor::prepare_pool_op_delete(MonOpRequestRef op
)
14234 op
->mark_osdmon_event(__func__
);
14235 auto m
= op
->get_req
<MPoolOp
>();
14237 int ret
= _prepare_remove_pool(m
->pool
, &ss
, false);
14238 if (ret
== -EAGAIN
) {
14239 wait_for_finished_proposal(op
, new C_RetryMessage(this, op
));
14243 dout(10) << __func__
<< " got " << ret
<< " " << ss
.str() << dendl
;
14244 wait_for_finished_proposal(op
, new OSDMonitor::C_PoolOp(this, op
, ret
,
14245 pending_inc
.epoch
));
14249 void OSDMonitor::_pool_op_reply(MonOpRequestRef op
,
14250 int ret
, epoch_t epoch
, bufferlist
*blp
)
14252 op
->mark_osdmon_event(__func__
);
14253 auto m
= op
->get_req
<MPoolOp
>();
14254 dout(20) << "_pool_op_reply " << ret
<< dendl
;
14255 MPoolOpReply
*reply
= new MPoolOpReply(m
->fsid
, m
->get_tid(),
14256 ret
, epoch
, get_last_committed(), blp
);
14257 mon
.send_reply(op
, reply
);
14260 void OSDMonitor::convert_pool_priorities(void)
14262 pool_opts_t::key_t key
= pool_opts_t::get_opt_desc("recovery_priority").key
;
14263 int64_t max_prio
= 0;
14264 int64_t min_prio
= 0;
14265 for (const auto &i
: osdmap
.get_pools()) {
14266 const auto &pool
= i
.second
;
14268 if (pool
.opts
.is_set(key
)) {
14270 pool
.opts
.get(key
, &prio
);
14271 if (prio
> max_prio
)
14273 if (prio
< min_prio
)
14277 if (max_prio
<= OSD_POOL_PRIORITY_MAX
&& min_prio
>= OSD_POOL_PRIORITY_MIN
) {
14278 dout(20) << __func__
<< " nothing to fix" << dendl
;
14281 // Current pool priorities exceeds new maximum
14282 for (const auto &i
: osdmap
.get_pools()) {
14283 const auto pool_id
= i
.first
;
14284 pg_pool_t pool
= i
.second
;
14287 pool
.opts
.get(key
, &prio
);
14290 if (prio
> 0 && max_prio
> OSD_POOL_PRIORITY_MAX
) { // Likely scenario
14291 // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX
14292 n
= (float)prio
/ max_prio
* OSD_POOL_PRIORITY_MAX
;
14293 } else if (prio
< 0 && min_prio
< OSD_POOL_PRIORITY_MIN
) {
14294 // Scaled priority range OSD_POOL_PRIORITY_MIN to 0
14295 n
= (float)prio
/ min_prio
* OSD_POOL_PRIORITY_MIN
;
14300 pool
.opts
.unset(key
);
14302 pool
.opts
.set(key
, static_cast<int64_t>(n
));
14304 dout(10) << __func__
<< " pool " << pool_id
14305 << " recovery_priority adjusted "
14306 << prio
<< " to " << n
<< dendl
;
14307 pool
.last_change
= pending_inc
.epoch
;
14308 pending_inc
.new_pools
[pool_id
] = pool
;
14312 void OSDMonitor::try_enable_stretch_mode_pools(stringstream
& ss
, bool *okay
,
14314 set
<pg_pool_t
*>* pools
,
14315 const string
& new_crush_rule
)
14317 dout(20) << __func__
<< dendl
;
14319 int new_crush_rule_result
= osdmap
.crush
->get_rule_id(new_crush_rule
);
14320 if (new_crush_rule_result
< 0) {
14321 ss
<< "unrecognized crush rule " << new_crush_rule_result
;
14322 *errcode
= new_crush_rule_result
;
14325 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14326 for (const auto& pooli
: osdmap
.pools
) {
14327 int64_t poolid
= pooli
.first
;
14328 const pg_pool_t
*p
= &pooli
.second
;
14329 if (!p
->is_replicated()) {
14330 ss
<< "stretched pools must be replicated; '" << osdmap
.pool_name
[poolid
] << "' is erasure-coded";
14331 *errcode
= -EINVAL
;
14334 uint8_t default_size
= g_conf().get_val
<uint64_t>("osd_pool_default_size");
14335 if ((p
->get_size() != default_size
||
14336 (p
->get_min_size() != g_conf().get_osd_pool_default_min_size(default_size
))) &&
14337 (p
->get_crush_rule() != new_rule
)) {
14338 ss
<< "we currently require stretch mode pools start out with the"
14339 " default size/min_size, which '" << osdmap
.pool_name
[poolid
] << "' does not";
14340 *errcode
= -EINVAL
;
14343 pg_pool_t
*pp
= pending_inc
.get_new_pool(poolid
, p
);
14344 // TODO: The part where we unconditionally copy the pools into pending_inc is bad
14345 // the attempt may fail and then we have these pool updates...but they won't do anything
14346 // if there is a failure, so if it's hard to change the interface, no need to bother
14353 void OSDMonitor::try_enable_stretch_mode(stringstream
& ss
, bool *okay
,
14354 int *errcode
, bool commit
,
14355 const string
& dividing_bucket
,
14356 uint32_t bucket_count
,
14357 const set
<pg_pool_t
*>& pools
,
14358 const string
& new_crush_rule
)
14360 dout(20) << __func__
<< dendl
;
14362 CrushWrapper crush
;
14363 _get_pending_crush(crush
);
14365 int retval
= crush
.get_validated_type_id(dividing_bucket
, &dividing_id
);
14366 if (retval
== -1) {
14367 ss
<< dividing_bucket
<< " is not a valid crush bucket type";
14368 *errcode
= -ENOENT
;
14369 ceph_assert(!commit
|| retval
!= -1);
14372 vector
<int> subtrees
;
14373 crush
.get_subtree_of_type(dividing_id
, &subtrees
);
14374 if (subtrees
.size() != 2) {
14375 ss
<< "there are " << subtrees
.size() << dividing_bucket
14376 << "'s in the cluster but stretch mode currently only works with 2!";
14377 *errcode
= -EINVAL
;
14378 ceph_assert(!commit
|| subtrees
.size() == 2);
14382 int new_crush_rule_result
= crush
.get_rule_id(new_crush_rule
);
14383 if (new_crush_rule_result
< 0) {
14384 ss
<< "unrecognized crush rule " << new_crush_rule
;
14385 *errcode
= new_crush_rule_result
;
14386 ceph_assert(!commit
|| (new_crush_rule_result
> 0));
14389 __u8 new_rule
= static_cast<__u8
>(new_crush_rule_result
);
14391 int weight1
= crush
.get_item_weight(subtrees
[0]);
14392 int weight2
= crush
.get_item_weight(subtrees
[1]);
14393 if (weight1
!= weight2
) {
14394 // TODO: I'm really not sure this is a good idea?
14395 ss
<< "the 2 " << dividing_bucket
14396 << "instances in the cluster have differing weights "
14397 << weight1
<< " and " << weight2
14398 <<" but stretch mode currently requires they be the same!";
14399 *errcode
= -EINVAL
;
14400 ceph_assert(!commit
|| (weight1
== weight2
));
14403 if (bucket_count
!= 2) {
14404 ss
<< "currently we only support 2-site stretch clusters!";
14405 *errcode
= -EINVAL
;
14406 ceph_assert(!commit
|| bucket_count
== 2);
14409 // TODO: check CRUSH rules for pools so that we are appropriately divided
14411 for (auto pool
: pools
) {
14412 pool
->crush_rule
= new_rule
;
14413 pool
->peering_crush_bucket_count
= bucket_count
;
14414 pool
->peering_crush_bucket_target
= bucket_count
;
14415 pool
->peering_crush_bucket_barrier
= dividing_id
;
14416 pool
->peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14417 pool
->size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_size");
14418 pool
->min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14420 pending_inc
.change_stretch_mode
= true;
14421 pending_inc
.stretch_mode_enabled
= true;
14422 pending_inc
.new_stretch_bucket_count
= bucket_count
;
14423 pending_inc
.new_degraded_stretch_mode
= 0;
14424 pending_inc
.new_stretch_mode_bucket
= dividing_id
;
14430 bool OSDMonitor::check_for_dead_crush_zones(const map
<string
,set
<string
>>& dead_buckets
,
14431 set
<int> *really_down_buckets
,
14432 set
<string
> *really_down_mons
)
14434 dout(20) << __func__
<< " with dead mon zones " << dead_buckets
<< dendl
;
14435 ceph_assert(is_readable());
14436 if (dead_buckets
.empty()) return false;
14437 set
<int> down_cache
;
14438 bool really_down
= false;
14439 for (auto dbi
: dead_buckets
) {
14440 const string
& bucket_name
= dbi
.first
;
14441 ceph_assert(osdmap
.crush
->name_exists(bucket_name
));
14442 int bucket_id
= osdmap
.crush
->get_item_id(bucket_name
);
14443 dout(20) << "Checking " << bucket_name
<< " id " << bucket_id
14444 << " to see if OSDs are also down" << dendl
;
14445 bool subtree_down
= osdmap
.subtree_is_down(bucket_id
, &down_cache
);
14446 if (subtree_down
) {
14447 dout(20) << "subtree is down!" << dendl
;
14448 really_down
= true;
14449 really_down_buckets
->insert(bucket_id
);
14450 really_down_mons
->insert(dbi
.second
.begin(), dbi
.second
.end());
14453 dout(10) << "We determined CRUSH buckets " << *really_down_buckets
14454 << " and mons " << *really_down_mons
<< " are really down" << dendl
;
14455 return really_down
;
14458 void OSDMonitor::trigger_degraded_stretch_mode(const set
<int>& dead_buckets
,
14459 const set
<string
>& live_zones
)
14461 dout(20) << __func__
<< dendl
;
14462 stretch_recovery_triggered
.set_from_double(0); // reset this; we can't go clean now!
14463 // update the general OSDMap changes
14464 pending_inc
.change_stretch_mode
= true;
14465 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14466 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14467 int new_site_count
= osdmap
.stretch_bucket_count
- dead_buckets
.size();
14468 ceph_assert(new_site_count
== 1); // stretch count 2!
14469 pending_inc
.new_degraded_stretch_mode
= new_site_count
;
14470 pending_inc
.new_recovering_stretch_mode
= 0;
14471 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14473 // and then apply them to all the pg_pool_ts
14474 ceph_assert(live_zones
.size() == 1); // only support 2 zones now
14475 const string
& remaining_site_name
= *(live_zones
.begin());
14476 ceph_assert(osdmap
.crush
->name_exists(remaining_site_name
));
14477 int remaining_site
= osdmap
.crush
->get_item_id(remaining_site_name
);
14478 for (auto pgi
: osdmap
.pools
) {
14479 if (pgi
.second
.peering_crush_bucket_count
) {
14480 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14481 newp
.peering_crush_bucket_count
= new_site_count
;
14482 newp
.peering_crush_mandatory_member
= remaining_site
;
14483 newp
.min_size
= pgi
.second
.min_size
/ 2; // only support 2 zones now
14484 newp
.last_force_op_resend
= pending_inc
.epoch
;
14490 void OSDMonitor::trigger_recovery_stretch_mode()
14492 dout(20) << __func__
<< dendl
;
14493 stretch_recovery_triggered
.set_from_double(0); // reset this so we don't go full-active prematurely
14494 pending_inc
.change_stretch_mode
= true;
14495 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14496 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14497 pending_inc
.new_degraded_stretch_mode
= osdmap
.degraded_stretch_mode
;
14498 pending_inc
.new_recovering_stretch_mode
= 1;
14499 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14501 for (auto pgi
: osdmap
.pools
) {
14502 if (pgi
.second
.peering_crush_bucket_count
) {
14503 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14504 newp
.last_force_op_resend
= pending_inc
.epoch
;
14510 void OSDMonitor::notify_new_pg_digest()
14512 dout(20) << __func__
<< dendl
;
14513 if (!stretch_recovery_triggered
.is_zero()) {
14514 try_end_recovery_stretch_mode(false);
14518 struct CMonExitRecovery
: public Context
{
14521 CMonExitRecovery(OSDMonitor
*mon
, bool f
) : m(mon
), force(f
) {}
14522 void finish(int r
) {
14523 m
->try_end_recovery_stretch_mode(force
);
14527 void OSDMonitor::try_end_recovery_stretch_mode(bool force
)
14529 dout(20) << __func__
<< dendl
;
14530 if (!mon
.is_leader()) return;
14531 if (!mon
.is_degraded_stretch_mode()) return;
14532 if (!mon
.is_recovering_stretch_mode()) return;
14533 if (!is_readable()) {
14534 wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14538 if (osdmap
.recovering_stretch_mode
&&
14539 ((!stretch_recovery_triggered
.is_zero() &&
14540 ceph_clock_now() - g_conf().get_val
<double>("mon_stretch_recovery_min_wait") >
14541 stretch_recovery_triggered
) ||
14543 if (!mon
.mgrstatmon()->is_readable()) {
14544 mon
.mgrstatmon()->wait_for_readable_ctx(new CMonExitRecovery(this, force
));
14547 const PGMapDigest
& pgd
= mon
.mgrstatmon()->get_digest();
14548 double misplaced
, degraded
, inactive
, unknown
;
14549 pgd
.get_recovery_stats(&misplaced
, °raded
, &inactive
, &unknown
);
14550 if (force
|| (degraded
== 0.0 && inactive
== 0.0 && unknown
== 0.0)) {
14551 // we can exit degraded stretch mode!
14552 mon
.trigger_healthy_stretch_mode();
14557 void OSDMonitor::trigger_healthy_stretch_mode()
14559 ceph_assert(is_writeable());
14560 stretch_recovery_triggered
.set_from_double(0);
14561 pending_inc
.change_stretch_mode
= true;
14562 pending_inc
.stretch_mode_enabled
= osdmap
.stretch_mode_enabled
;
14563 pending_inc
.new_stretch_bucket_count
= osdmap
.stretch_bucket_count
;
14564 pending_inc
.new_degraded_stretch_mode
= 0; // turn off degraded mode...
14565 pending_inc
.new_recovering_stretch_mode
= 0; //...and recovering mode!
14566 pending_inc
.new_stretch_mode_bucket
= osdmap
.stretch_mode_bucket
;
14567 for (auto pgi
: osdmap
.pools
) {
14568 if (pgi
.second
.peering_crush_bucket_count
) {
14569 pg_pool_t
& newp
= *pending_inc
.get_new_pool(pgi
.first
, &pgi
.second
);
14570 newp
.peering_crush_bucket_count
= osdmap
.stretch_bucket_count
;
14571 newp
.peering_crush_mandatory_member
= CRUSH_ITEM_NONE
;
14572 newp
.min_size
= g_conf().get_val
<uint64_t>("mon_stretch_pool_min_size");
14573 newp
.last_force_op_resend
= pending_inc
.epoch
;